From 3195bf00e08c343205058a5079c5ab8bc6841035 Mon Sep 17 00:00:00 2001 From: Zimin Li Date: Thu, 16 Jan 2025 17:21:42 +0800 Subject: [PATCH] Refactored the tests with profiling --- operatorspy/tests/add.py | 70 ++++-- operatorspy/tests/attention.py | 190 ++++++++++------ operatorspy/tests/avg_pool.py | 100 ++++---- operatorspy/tests/causal_softmax.py | 81 +++++-- operatorspy/tests/conv.py | 104 +++++---- operatorspy/tests/expand.py | 81 ++++--- operatorspy/tests/gemm.py | 103 +++++---- operatorspy/tests/global_avg_pool.py | 94 ++++---- operatorspy/tests/max_pool.py | 100 ++++---- operatorspy/tests/mlp.py | 175 +++++++++----- operatorspy/tests/random_sample.py | 97 ++++++-- operatorspy/tests/rearrange.py | 80 +++++-- operatorspy/tests/relu.py | 78 ++++--- operatorspy/tests/rms_norm.py | 107 +++++++-- operatorspy/tests/rotary_embedding.py | 117 ++++++++-- operatorspy/tests/swiglu.py | 315 ++++++++++++++------------ 16 files changed, 1242 insertions(+), 650 deletions(-) diff --git a/operatorspy/tests/add.py b/operatorspy/tests/add.py index 455014cc..cce981bf 100644 --- a/operatorspy/tests/add.py +++ b/operatorspy/tests/add.py @@ -2,6 +2,7 @@ import ctypes import sys import os +import time sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) from operatorspy import ( @@ -15,10 +16,13 @@ check_error, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device from enum import Enum, auto import torch +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 class Inplace(Enum): OUT_OF_PLACE = auto() @@ -83,36 +87,65 @@ def test( check_error( lib.infiniopAdd(descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None) ) + assert torch.allclose(c, ans, atol=0, rtol=1e-3) + + if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + _ = add(a, b) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = add(a, b) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): + check_error( + lib.infiniopAdd(descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None) + ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopAdd(descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None) + ) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed * 1000 :6f} ms") + check_error(lib.infiniopDestroyAddDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) for c_shape, a_shape, b_shape, inplace in test_cases: - test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) - test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=tensor_dtype, inplace=inplace) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) for c_shape, a_shape, b_shape, inplace in test_cases: - test(lib, handle, "cuda", c_shape, a_shape, b_shape, 
tensor_dtype=torch.float16, inplace=inplace) - test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=tensor_dtype, inplace=inplace) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG handle = create_handle(lib, device) for c_shape, a_shape, b_shape, inplace in test_cases: - test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) - test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=tensor_dtype, inplace=inplace) destroy_handle(lib, handle) @@ -126,14 +159,17 @@ def test_bang(lib, test_cases): ((), (), (), Inplace.OUT_OF_PLACE), ((3, 3), (3, 3), (3, 3), Inplace.OUT_OF_PLACE), ((2, 20, 3), (2, 1, 3), (2, 20, 3), Inplace.OUT_OF_PLACE), - ((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_A), - ((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_B), + ((3, 20, 512), (3, 20, 512), (3, 20, 512), Inplace.INPLACE_A), + ((3, 20, 512), (3, 20, 512), (3, 20, 512), Inplace.INPLACE_B), ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), ] + tensor_dtypes = [ + torch.float16, torch.float32, + ] args = get_args() lib = open_lib() lib.infiniopCreateAddDescriptor.restype = c_int32 @@ -157,12 +193,14 @@ def test_bang(lib, test_cases): infiniopAddDescriptor_t, ] + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if not (args.cpu or args.cuda or args.bang): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/attention.py b/operatorspy/tests/attention.py index f5449aaa..2225bc6d 100644 --- a/operatorspy/tests/attention.py +++ b/operatorspy/tests/attention.py @@ -2,6 +2,7 @@ import ctypes import sys import os +import time sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) from operatorspy import ( @@ -18,10 +19,13 @@ create_workspace, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device import torch import torch.nn.functional as F +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 class AttentionDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -186,10 +190,59 @@ def test( assert torch.allclose(out, ans, atol=1e-4, rtol=1e-2) + if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + _ = attention(q, k, v, k_cache, v_cache, pos) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = attention(q, k, v, k_cache, v_cache, pos) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 
1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): + check_error( + lib.infiniopAttention( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + out_tensor.data, + q_tensor.data, + k_tensor.data, + v_tensor.data, + k_cache_tensor.data, + v_cache_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopAttention( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + out_tensor.data, + q_tensor.data, + k_tensor.data, + v_tensor.data, + k_cache_tensor.data, + v_cache_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed * 1000 :6f} ms") + check_error(lib.infiniopDestroyAttentionDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) @@ -201,36 +254,36 @@ def test_cpu(lib, test_cases): pos, k_cache_buf_len, v_cache_buf_len, - dtype, q_stride, k_stride, v_stride, k_cache_stride, v_cache_stride, ) in test_cases: - test( - lib, - handle, - "cpu", - n_q_head, - n_kv_head, - seq_len, - head_dim, - pos, - k_cache_buf_len, - v_cache_buf_len, - dtype, - q_stride, - k_stride, - v_stride, - k_cache_stride, - v_cache_stride, - ) + for tensor_dtype in tensor_dtypes: + test( + lib, + handle, + "cpu", + n_q_head, + n_kv_head, + seq_len, + head_dim, + pos, + k_cache_buf_len, + v_cache_buf_len, + tensor_dtype, + q_stride, + k_stride, + v_stride, + k_cache_stride, + v_cache_stride, + ) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) @@ -242,36 +295,36 @@ def test_cuda(lib, test_cases): pos, k_cache_buf_len, v_cache_buf_len, - dtype, q_stride, k_stride, v_stride, k_cache_stride, v_cache_stride, ) in test_cases: - test( - lib, - handle, - "cuda", - n_q_head, - n_kv_head, - seq_len, - head_dim, - pos, - k_cache_buf_len, - v_cache_buf_len, - dtype, - q_stride, - k_stride, - v_stride, - k_cache_stride, - v_cache_stride, - ) + for tensor_dtype in tensor_dtypes: + test( + lib, + handle, + "cuda", + n_q_head, + n_kv_head, + seq_len, + head_dim, + pos, + k_cache_buf_len, + v_cache_buf_len, + tensor_dtype, + q_stride, + k_stride, + v_stride, + k_cache_stride, + v_cache_stride, + ) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG @@ -292,24 +345,25 @@ def test_bang(lib, test_cases): k_cache_stride, v_cache_stride, ) in test_cases: - test( - lib, - handle, - "mlu", - n_q_head, - n_kv_head, - seq_len, - head_dim, - pos, - k_cache_buf_len, - v_cache_buf_len, - dtype, - q_stride, - k_stride, - v_stride, - k_cache_stride, - v_cache_stride, - ) + for tensor_dtype in tensor_dtypes: + test( + lib, + handle, + "mlu", + n_q_head, + n_kv_head, + seq_len, + head_dim, + pos, + k_cache_buf_len, + v_cache_buf_len, + dtype, + q_stride, + k_stride, + v_stride, + k_cache_stride, + v_cache_stride, + ) destroy_handle(lib, handle) @@ -325,7 +379,6 @@ def test_bang(lib, test_cases): 0, # pos 2048, # k_cache_buf_len 2048, # v_cache_buf_len - torch.float16, # dtype [64, 2560, 1], # q_stride [64, 2560, 1], # k_stride [64, 2560, 1], # v_stride @@ -341,7 +394,6 @@ def test_bang(lib, 
test_cases): 3, # pos 2048, # k_cache_buf_len 2048, # v_cache_buf_len - torch.float16, # dtype [64, 2560, 1], # q_stride [64, 2560, 1], # k_stride [64, 2560, 1], # v_stride @@ -357,7 +409,6 @@ def test_bang(lib, test_cases): 1, # pos 8, # k_cache_buf_len 8, # v_cache_buf_len - torch.float16, # dtype None, # q_stride None, # k_stride None, # v_stride @@ -365,6 +416,9 @@ def test_bang(lib, test_cases): None, # v_cache_stride ), ] + tensor_dtypes = [ + torch.float16, + ] args = get_args() lib = open_lib() @@ -406,12 +460,14 @@ def test_bang(lib, test_cases): infiniopAttentionDescriptor_t, ] + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if not (args.cpu or args.cuda or args.bang): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/avg_pool.py b/operatorspy/tests/avg_pool.py index 9c240789..f0530327 100644 --- a/operatorspy/tests/avg_pool.py +++ b/operatorspy/tests/avg_pool.py @@ -16,13 +16,10 @@ check_error, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device import torch from typing import Tuple -# constant for control whether profile the pytorch and lib functions -# NOTE: need to manually add synchronization function to the lib function, -# e.g., cudaDeviceSynchronize() for CUDA PROFILE = False NUM_PRERUN = 10 NUM_ITERATIONS = 1000 @@ -51,8 +48,6 @@ def pool(x, k, padding, stride, dilation = 1): ans = pooling_layers[ndim](k, stride=stride, padding=padding)(x.to(torch.float32)).to(torch.float16) else: ans = pooling_layers[ndim](k, stride=stride, padding=padding)(x) - if PROFILE: - torch.cuda.synchronize() return ans @@ -92,14 +87,7 @@ def test( x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device) y = torch.rand(inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype).to(torch_device) - for i in range(NUM_PRERUN if PROFILE else 1): - ans = pool(x, k_shape, padding, strides) - if PROFILE: - start_time = time.time() - for i in range(NUM_ITERATIONS): - _ = pool(x, k_shape, padding, strides) - elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f"pytorch time: {elapsed :6f}") + ans = pool(x, k_shape, padding, strides) x_tensor = to_tensor(x, lib) y_tensor = to_tensor(y, lib) @@ -129,18 +117,47 @@ def test( workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) - for i in range(NUM_PRERUN if PROFILE else 1): - check_error( - lib.infiniopAvgPool( - descriptor, - workspace_ptr, - workspaceSize, - y_tensor.data, - x_tensor.data, - None, - ) + check_error( + lib.infiniopAvgPool( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + None, ) + ) + + if (tensor_dtype == torch.float16): + assert torch.allclose(y, ans, atol=0, rtol=1e-3) + else: + assert torch.allclose(y, ans, atol=1e-6, rtol=1e-5) + if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + _ = pool(x, k_shape, padding, strides) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = pool(x, k_shape, padding, strides) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / 
NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): + check_error( + lib.infiniopAvgPool( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + None, + ) + ) + synchronize_device(torch_device) start_time = time.time() for i in range(NUM_ITERATIONS): check_error( @@ -153,39 +170,39 @@ def test( None, ) ) + synchronize_device(torch_device) elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f" lib time: {elapsed :6f}") + print(f" lib time: {elapsed * 1000 :6f} ms") - assert torch.allclose(y, ans, atol=0, rtol=1e-3) check_error(lib.infiniopDestroyAvgPoolDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) for x_shape, kernel_shape, padding, strides in test_cases: - test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) - test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) for x_shape, kernel_shape, padding, strides in test_cases: - test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) - test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG handle = create_handle(lib, device) for x_shape, kernel_shape, padding, strides in test_cases: - test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) - test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) @@ -196,6 +213,9 @@ def test_bang(lib, test_cases): ((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)), ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)), ] + tensor_dtypes = [ + torch.float16, torch.float32, + ] args = get_args() lib = open_lib() lib.infiniopCreateAvgPoolDescriptor.restype = c_int32 @@ -228,12 +248,14 @@ def test_bang(lib, test_cases): infiniopAvgPoolDescriptor_t, ] + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if not (args.cpu or args.cuda or args.bang): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/causal_softmax.py b/operatorspy/tests/causal_softmax.py index 1ad304b2..726bd5ca 100644 --- a/operatorspy/tests/causal_softmax.py +++ b/operatorspy/tests/causal_softmax.py @@ -2,6 +2,7 @@ import ctypes import sys import os +import time sys.path.insert(0, 
os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) @@ -18,9 +19,12 @@ create_workspace, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device import torch +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 class CausalSoftmaxDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -73,41 +77,85 @@ def test(lib, handle, torch_device, x_shape, x_stride=None, x_dtype=torch.float1 ) ) assert torch.allclose(x, ans, atol=0, rtol=1e-2) + + if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + _ = causal_softmax(x) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = causal_softmax(x) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): + check_error( + lib.infiniopCausalSoftmax( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + x_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopCausalSoftmax( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + x_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed * 1000 :6f} ms") + check_error(lib.infiniopDestroyCausalSoftmaxDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) for x_shape, x_stride in test_cases: - test(lib, handle, "cpu", x_shape, x_stride) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cpu", x_shape, x_stride, x_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) for x_shape, x_stride in test_cases: - test(lib, handle, "cuda", x_shape, x_stride) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cuda", x_shape, x_stride, x_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG handle = create_handle(lib, device) for x_shape, x_stride in test_cases: - test(lib, handle, "mlu", x_shape, x_stride) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "mlu", x_shape, x_stride, x_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_ascend(lib, test_cases): +def test_ascend(lib, test_cases, tensor_dtypes): import torch_npu device = DeviceEnum.DEVICE_ASCEND handle = create_handle(lib, device) for x_shape, x_stride in test_cases: - test(lib, handle, "npu", x_shape, x_stride) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "npu", x_shape, x_stride, x_dtype=tensor_dtype) destroy_handle(lib, handle) @@ -117,6 +165,9 @@ def test_ascend(lib, test_cases): ((32, 20, 512), None), ((32, 20, 512), (20480, 512, 1)), # Ascend 暂不支持非连续 ] + tensor_dtypes = [ + torch.float16, + ] args = get_args() lib = open_lib() lib.infiniopCreateCausalSoftmaxDescriptor.restype = c_int32 @@ -143,14 +194,16 @@ def test_ascend(lib, test_cases): infiniopCausalSoftmaxDescriptor_t, ] + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, 
test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if args.ascend: - test_ascend(lib, test_cases) + test_ascend(lib, test_cases, tensor_dtypes) if not (args.cpu or args.cuda or args.bang or args.ascend): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/conv.py b/operatorspy/tests/conv.py index 7e7ea953..eac7869a 100644 --- a/operatorspy/tests/conv.py +++ b/operatorspy/tests/conv.py @@ -16,16 +16,13 @@ check_error, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device import torch import math import ctypes from torch.nn import functional as F from typing import List, Tuple -# constant for control whether profile the pytorch and lib functions -# NOTE: need to manually add synchronization function to the lib function, -# e.g., cudaDeviceSynchronize() for CUDA PROFILE = False NUM_PRERUN = 10 NUM_ITERATIONS = 1000 @@ -108,14 +105,7 @@ def test( inferShape(x.shape, w.shape, pads, strides, dilations), dtype=tensor_dtype ).to(torch_device) - for i in range(NUM_PRERUN if PROFILE else 1): - ans = conv(x, w, strides, pads, dilations) - if PROFILE: - start_time = time.time() - for i in range(NUM_ITERATIONS): - _ = conv(x, w, strides, pads, dilations) - elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f"pytorch time: {elapsed :6f}") + ans = conv(x, w, strides, pads, dilations) x_tensor = to_tensor(x, lib) w_tensor = to_tensor(w, lib) @@ -148,21 +138,37 @@ def test( workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) - for i in range(NUM_PRERUN if PROFILE else 1): - check_error( - lib.infiniopConv( - descriptor, - workspace_ptr, - workspaceSize, - y_tensor.data, - x_tensor.data, - w_tensor.data, - None, - ) + check_error( + lib.infiniopConv( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + w_tensor.data, + None, ) + ) + + if (tensor_dtype == torch.float16): + assert torch.allclose(y, ans, atol=0, rtol=1e-2) + else: + assert torch.allclose(y, ans, atol=0, rtol=1e-3) + if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + _ = conv(x, w, strides, pads, dilations) + synchronize_device(torch_device) start_time = time.time() for i in range(NUM_ITERATIONS): + _ = conv(x, w, strides, pads, dilations) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): check_error( lib.infiniopConv( descriptor, @@ -174,42 +180,53 @@ def test( None, ) ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopConv( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + w_tensor.data, + None, + ) + ) + synchronize_device(torch_device) elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f" lib time: {elapsed :6f}") + print(f" lib time: {elapsed * 1000 :6f} ms") - if (tensor_dtype == torch.float16): - assert torch.allclose(y, ans, atol=0, rtol=1e-2) - else: - assert torch.allclose(y, ans, atol=0, rtol=1e-3) 
check_error(lib.infiniopDestroyConvDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases: - test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16) - test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases: - test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16) - test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG handle = create_handle(lib, device) for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases: - test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16) - test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) @@ -257,6 +274,9 @@ def test_bang(lib, test_cases): None, ), ] + tensor_dtypes = [ + torch.float16, torch.float32, + ] args = get_args() lib = open_lib() lib.infiniopCreateConvDescriptor.restype = c_int32 @@ -286,12 +306,14 @@ def test_bang(lib, test_cases): infiniopConvDescriptor_t, ] + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if not (args.cpu or args.cuda or args.bang): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/expand.py b/operatorspy/tests/expand.py index e060ad73..fc5535b5 100644 --- a/operatorspy/tests/expand.py +++ b/operatorspy/tests/expand.py @@ -17,12 +17,9 @@ rearrange_tensor, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device import torch -# constant for control whether profile the pytorch and lib functions -# NOTE: need to manually add synchronization function to the lib function, -# e.g., cudaDeviceSynchronize() for CUDA PROFILE = False NUM_PRERUN = 10 NUM_ITERATIONS = 1000 @@ -36,11 +33,7 @@ class ExpandDescriptor(Structure): def expand(x, y): - if PROFILE: - ans = x.expand_as(y).clone() - torch.cuda.synchronize() - return ans - return x.expand_as(y) + return x.expand_as(y).clone() def test( @@ -65,14 +58,7 @@ def test( if y_stride is not None: y = rearrange_tensor(y, 
y_stride) - for i in range(NUM_PRERUN if PROFILE else 1): - ans = expand(x, y) - if PROFILE: - start_time = time.time() - for i in range(NUM_ITERATIONS): - _ = expand(x, y) - elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f"pytorch time: {elapsed :6f}") + ans = expand(x, y) x_tensor = to_tensor(x, lib) y_tensor = to_tensor(y, lib) @@ -91,46 +77,62 @@ def test( x_tensor.descriptor.contents.invalidate() y_tensor.descriptor.contents.invalidate() - for i in range(NUM_PRERUN if PROFILE else 1): - check_error(lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None)) + check_error(lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None)) + + assert torch.allclose(y, ans, atol=0, rtol=0) + if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + _ = expand(x, y) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = expand(x, y) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): + check_error(lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None)) + synchronize_device(torch_device) start_time = time.time() for i in range(NUM_ITERATIONS): - check_error( - lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None) - ) + check_error(lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None)) + synchronize_device(torch_device) elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f" lib time: {elapsed :6f}") - assert torch.allclose(y, ans, atol=0, rtol=1e-3) + print(f" lib time: {elapsed * 1000 :6f} ms") + check_error(lib.infiniopDestroyExpandDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) for y_shape, x_shape, y_stride, x_stride in test_cases: - test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16) - test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) for y_shape, x_shape, y_stride, x_stride in test_cases: - test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16) - test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG handle = create_handle(lib, device) for y_shape, x_shape, y_stride, x_stride in test_cases: - test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16) - test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) @@ -147,6 +149,9 @@ def test_bang(lib, test_cases): ((3, 2, 4, 5), (3, 2, 1, 1), None, None), ((32, 256, 
112, 112), (32, 256, 112, 1), None, None), ] + tensor_dtypes = [ + torch.float16, torch.float32, + ] args = get_args() lib = open_lib() lib.infiniopCreateExpandDescriptor.restype = c_int32 @@ -168,12 +173,14 @@ def test_bang(lib, test_cases): infiniopExpandDescriptor_t, ] + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if not (args.cpu or args.cuda or args.bang): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/gemm.py b/operatorspy/tests/gemm.py index 5da99eac..bee41b17 100644 --- a/operatorspy/tests/gemm.py +++ b/operatorspy/tests/gemm.py @@ -17,12 +17,9 @@ rearrange_tensor, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device import torch -# constant for control whether profile the pytorch and lib functions -# NOTE: need to manually add synchronization function to the lib function, -# e.g., cudaDeviceSynchronize() for CUDA PROFILE = False NUM_PRERUN = 10 NUM_ITERATIONS = 1000 @@ -40,8 +37,6 @@ def gemm(A, B, C=None, transA=False, transB=False, alpha=1.0, beta=0.0, dtype=to result = alpha * torch.matmul(A if dtype != torch.float16 else A.to(torch.float32), B if dtype != torch.float16 else B.to(torch.float32)).to(dtype) if C is not None: result += beta * C if dtype != torch.float16 else C.to(torch.float32) - if PROFILE: - torch.cuda.synchronize() return result @@ -83,14 +78,7 @@ def test( if y_stride is not None: y = rearrange_tensor(y, y_stride) - for i in range(NUM_PRERUN if PROFILE else 1): - ans = gemm(a, b, c, transA, transB, alpha, beta, dtype) - if PROFILE: - start_time = time.time() - for i in range(NUM_ITERATIONS): - _ = gemm(a, b, c, transA, transB, alpha, beta, dtype) - elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f"pytorch time: {elapsed :6f}") + ans = gemm(a, b, c, transA, transB, alpha, beta, dtype) a_tensor = to_tensor(a, lib) b_tensor = to_tensor(b, lib) @@ -130,22 +118,35 @@ def test( ) workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) - for i in range(NUM_PRERUN if PROFILE else 1): - check_error( - lib.infiniopGEMM( - descriptor, - workspace_ptr, - workspace_size, - y_tensor.data, - a_tensor.data, - b_tensor.data, - c_tensor.data if c_tensor else None, - None, - ) + check_error( + lib.infiniopGEMM( + descriptor, + workspace_ptr, + workspace_size, + y_tensor.data, + a_tensor.data, + b_tensor.data, + c_tensor.data if c_tensor else None, + None, ) + ) + + assert torch.allclose(y, ans, atol=0, rtol=1e-2) + if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + _ = gemm(a, b, c, transA, transB, alpha, beta, dtype) + synchronize_device(torch_device) start_time = time.time() for i in range(NUM_ITERATIONS): + _ = gemm(a, b, c, transA, transB, alpha, beta, dtype) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): check_error( lib.infiniopGEMM( descriptor, @@ -158,14 +159,29 @@ def test( None, ) ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopGEMM( + descriptor, + 
workspace_ptr, + workspace_size, + y_tensor.data, + a_tensor.data, + b_tensor.data, + c_tensor.data if c_tensor else None, + None, + ) + ) + synchronize_device(torch_device) elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f" lib time: {elapsed :6f}") + print(f" lib time: {elapsed * 1000 :6f} ms") - assert torch.allclose(y, ans, atol=0, rtol=1e-2) check_error(lib.infiniopDestroyGEMMDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) for ( @@ -182,12 +198,12 @@ def test_cpu(lib, test_cases): c_stride, y_stride, ) in test_cases: - test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16) - test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=tensor_dtype) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) for ( @@ -204,12 +220,12 @@ def test_cuda(lib, test_cases): c_stride, y_stride, ) in test_cases: - test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16) - test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=tensor_dtype) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG @@ -229,8 +245,8 @@ def test_bang(lib, test_cases): c_stride, y_stride, ) in test_cases: - test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16) - test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=tensor_dtype) destroy_handle(lib, handle) @@ -323,6 +339,9 @@ def test_bang(lib, test_cases): (4096, 1), ), ] + tensor_dtypes = [ + torch.float16, torch.float32, + ] args = get_args() lib = open_lib() @@ -363,12 +382,14 @@ def test_bang(lib, test_cases): infiniopGEMMDescriptor_t, ] + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if not (args.cpu or args.cuda or args.bang): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/global_avg_pool.py b/operatorspy/tests/global_avg_pool.py index 33f7b64d..b1907162 100644 --- a/operatorspy/tests/global_avg_pool.py 
+++ b/operatorspy/tests/global_avg_pool.py @@ -16,12 +16,9 @@ check_error, ) -from operatorspy.tests.test_utils import get_args -import torch, time +from operatorspy.tests.test_utils import get_args, synchronize_device +import torch -# constant for control whether profile the pytorch and lib functions -# NOTE: need to manually add synchronization function to the lib function, -# e.g., cudaDeviceSynchronize() for CUDA PROFILE = False NUM_PRERUN = 10 NUM_ITERATIONS = 1000 @@ -40,8 +37,6 @@ def inferShape(x): def globalAvgPool(x): y = torch.mean(x, dim=tuple(range(2, x.dim())), keepdim=True) - if PROFILE: - torch.cuda.synchronize() return y.view(*inferShape(x)) @@ -59,14 +54,7 @@ def test( x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device) y = torch.zeros(inferShape(x), dtype=tensor_dtype).to(torch_device) - for i in range(NUM_PRERUN if PROFILE else 1): - ans = globalAvgPool(x) - if PROFILE: - start_time = time.time() - for i in range(NUM_ITERATIONS): - _ = globalAvgPool(x) - elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f"pytorch time: {elapsed :6f}") + ans = globalAvgPool(x) x_tensor = to_tensor(x, lib) y_tensor = to_tensor(y, lib) @@ -96,58 +84,77 @@ def test( ) workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) - for i in range(NUM_PRERUN if PROFILE else 1): - check_error( - lib.infiniopGlobalAvgPool( - descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None - ) + check_error( + lib.infiniopGlobalAvgPool( + descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None ) + ) + + if (tensor_dtype == torch.float16): + assert torch.allclose(y, ans, atol=0, rtol=1e-3) + else: + assert torch.allclose(y, ans, atol=1e-6, rtol=1e-4) + if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + _ = globalAvgPool(x) + synchronize_device(torch_device) start_time = time.time() for i in range(NUM_ITERATIONS): + _ = globalAvgPool(x) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): check_error( lib.infiniopGlobalAvgPool( - descriptor, - workspace_ptr, - workspaceSize, - y_tensor.data, - x_tensor.data, - None, + descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None ) ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopGlobalAvgPool( + descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None + ) + ) + synchronize_device(torch_device) elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f" lib time: {elapsed :6f}") + print(f" lib time: {elapsed * 1000 :6f} ms") - assert torch.allclose(y, ans, atol=0, rtol=1e-3) check_error(lib.infiniopDestroyGlobalAvgPoolDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) for x_shape in test_cases: - test(lib, handle, "cpu", x_shape, tensor_dtype=torch.float16) - test(lib, handle, "cpu", x_shape, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cpu", x_shape, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) for x_shape in test_cases: - test(lib, handle, "cuda", x_shape, 
tensor_dtype=torch.float16) - test(lib, handle, "cuda", x_shape, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cuda", x_shape, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG handle = create_handle(lib, device) for x_shape in test_cases: - test(lib, handle, "mlu", x_shape, tensor_dtype=torch.float16) - test(lib, handle, "mlu", x_shape, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "mlu", x_shape, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) @@ -169,6 +176,9 @@ def test_bang(lib, test_cases): ((32, 256, 1, 112, 112)), ((32, 256, 112, 112)), ] + tensor_dtypes = [ + torch.float16, torch.float32, + ] args = get_args() lib = open_lib() lib.infiniopCreateGlobalAvgPoolDescriptor.restype = c_int32 @@ -197,12 +207,14 @@ def test_bang(lib, test_cases): infiniopGlobalAvgPoolDescriptor_t, ] + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if not (args.cpu or args.cuda or args.bang): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/max_pool.py b/operatorspy/tests/max_pool.py index ffc0bb19..abd1eddb 100644 --- a/operatorspy/tests/max_pool.py +++ b/operatorspy/tests/max_pool.py @@ -16,13 +16,10 @@ check_error, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device import torch from typing import Tuple -# constant for control whether profile the pytorch and lib functions -# NOTE: need to manually add synchronization function to the lib function, -# e.g., cudaDeviceSynchronize() for CUDA PROFILE = False NUM_PRERUN = 10 NUM_ITERATIONS = 1000 @@ -48,8 +45,6 @@ def pool(x, k, padding, stride, dilation = 1): return None ans = pooling_layers[ndim](k, stride=stride, padding=padding, dilation=dilation)(x) - if PROFILE: - torch.cuda.synchronize() return ans @@ -89,14 +84,7 @@ def test( x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device) y = torch.rand(inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype).to(torch_device) - for i in range(NUM_PRERUN if PROFILE else 1): - ans = pool(x, k_shape, padding, strides) - if PROFILE: - start_time = time.time() - for i in range(NUM_ITERATIONS): - _ = pool(x, k_shape, padding, strides) - elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f"pytorch time: {elapsed :6f}") + ans = pool(x, k_shape, padding, strides) x_tensor = to_tensor(x, lib) y_tensor = to_tensor(y, lib) @@ -126,18 +114,47 @@ def test( workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) - for i in range(NUM_PRERUN if PROFILE else 1): - check_error( - lib.infiniopMaxPool( - descriptor, - workspace_ptr, - workspaceSize, - y_tensor.data, - x_tensor.data, - None, - ) + check_error( + lib.infiniopMaxPool( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + None, ) + ) + + if (tensor_dtype == torch.float16): + assert torch.allclose(y, ans, atol=0, rtol=1e-3) + else: + assert torch.allclose(y, ans, atol=1e-6, rtol=1e-5) + 
if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + _ = pool(x, k_shape, padding, strides) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = pool(x, k_shape, padding, strides) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): + check_error( + lib.infiniopMaxPool( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + None, + ) + ) + synchronize_device(torch_device) start_time = time.time() for i in range(NUM_ITERATIONS): check_error( @@ -150,39 +167,39 @@ def test( None, ) ) + synchronize_device(torch_device) elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f" lib time: {elapsed :6f}") + print(f" lib time: {elapsed * 1000 :6f} ms") - assert torch.allclose(y, ans, atol=0, rtol=1e-3) check_error(lib.infiniopDestroyMaxPoolDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) for x_shape, kernel_shape, padding, strides in test_cases: - test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) - test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) for x_shape, kernel_shape, padding, strides in test_cases: - test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) - test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG handle = create_handle(lib, device) for x_shape, kernel_shape, padding, strides in test_cases: - test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) - test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) @@ -193,6 +210,9 @@ def test_bang(lib, test_cases): ((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)), ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)), ] + tensor_dtypes = [ + torch.float16, torch.float32, + ] args = get_args() lib = open_lib() lib.infiniopCreateMaxPoolDescriptor.restype = c_int32 @@ -225,12 +245,14 @@ def test_bang(lib, test_cases): infiniopMaxPoolDescriptor_t, ] + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if not (args.cpu or args.cuda or args.bang): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) print("\033[92mTest 
passed!\033[0m") diff --git a/operatorspy/tests/mlp.py b/operatorspy/tests/mlp.py index 668d7861..b4bdfcb8 100644 --- a/operatorspy/tests/mlp.py +++ b/operatorspy/tests/mlp.py @@ -2,6 +2,7 @@ import ctypes import sys import os +import time sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) from operatorspy import ( @@ -18,10 +19,13 @@ create_workspace, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device import torch import torch.nn as nn +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 class MLPDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -137,10 +141,55 @@ def test( ) assert torch.allclose(y, ans, atol=0, rtol=2e-2) + if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + _ = mlp(y, x, w12, w3, alpha, residual) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = mlp(y, x, w12, w3, alpha, residual) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): + check_error( + lib.infiniopMLP( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + y_tensor.data, + x_tensor.data, + w12_tensor.data, + w3_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopMLP( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + y_tensor.data, + x_tensor.data, + w12_tensor.data, + w3_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed * 1000 :6f} ms") + check_error(lib.infiniopDestroyMLPDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) @@ -150,32 +199,32 @@ def test_cpu(lib, test_cases): intermediate_size, alpha, residual, - dtype, x_stride, y_stride, w12_stride, w3_stride, ) in test_cases: - test( - lib, - handle, - "cpu", - num_tokens, - hidden_size, - intermediate_size, - alpha, - residual, - dtype, - x_stride, - y_stride, - w12_stride, - w3_stride, - ) + for tensor_dtype in tensor_dtypes: + test( + lib, + handle, + "cpu", + num_tokens, + hidden_size, + intermediate_size, + alpha, + residual, + tensor_dtype, + x_stride, + y_stride, + w12_stride, + w3_stride, + ) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) @@ -185,32 +234,32 @@ def test_cuda(lib, test_cases): intermediate_size, alpha, residual, - dtype, x_stride, y_stride, w12_stride, w3_stride, ) in test_cases: - test( - lib, - handle, - "cuda", - num_tokens, - hidden_size, - intermediate_size, - alpha, - residual, - dtype, - x_stride, - y_stride, - w12_stride, - w3_stride, - ) + for tensor_dtype in tensor_dtypes: + test( + lib, + handle, + "cuda", + num_tokens, + hidden_size, + intermediate_size, + alpha, + residual, + tensor_dtype, + x_stride, + y_stride, + w12_stride, + w3_stride, + ) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG @@ -222,27 +271,27 @@ def 
test_bang(lib, test_cases): intermediate_size, alpha, residual, - dtype, x_stride, y_stride, w12_stride, w3_stride, ) in test_cases: - test( - lib, - handle, - "mlu", - num_tokens, - hidden_size, - intermediate_size, - alpha, - residual, - dtype, - x_stride, - y_stride, - w12_stride, - w3_stride, - ) + for tensor_dtype in tensor_dtypes: + test( + lib, + handle, + "mlu", + num_tokens, + hidden_size, + intermediate_size, + alpha, + residual, + tensor_dtype, + x_stride, + y_stride, + w12_stride, + w3_stride, + ) destroy_handle(lib, handle) @@ -250,22 +299,24 @@ def test_bang(lib, test_cases): if __name__ == "__main__": test_cases = [ # num_tokens, hidden_size, intermediate_size, alpha, residual, dtype, x_stride, y_stride, w12_stride, w3_stride - (4, 4096, 11008, 1.0, True, torch.float16, None, None, None, None), - (4, 4096, 11008, 1.0, True, torch.float16, [8192, 1], [8192, 1], None, None), + (4, 4096, 11008, 1.0, True, None, None, None, None), + (4, 4096, 11008, 1.0, True, [8192, 1], [8192, 1], None, None), ( 4, 4096, 11008, 1.0, True, - torch.float16, None, None, [1, 4096], [1, 11008], ), - (4, 4096, 11008, 1.0, False, torch.float16, None, None, None, None), - (4, 4096, 11008, 1.0, False, torch.float16, [8192, 1], [8192, 1], None, None), + (4, 4096, 11008, 1.0, False, None, None, None, None), + (4, 4096, 11008, 1.0, False, [8192, 1], [8192, 1], None, None), + ] + tensor_dtypes = [ + torch.float16, ] args = get_args() lib = open_lib() @@ -305,12 +356,14 @@ def test_bang(lib, test_cases): infiniopMLPDescriptor_t, ] + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if not (args.cpu or args.cuda or args.bang): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/random_sample.py b/operatorspy/tests/random_sample.py index 98a8dceb..8693d807 100644 --- a/operatorspy/tests/random_sample.py +++ b/operatorspy/tests/random_sample.py @@ -2,6 +2,7 @@ import ctypes import sys import os +import time sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) from operatorspy import ( @@ -18,9 +19,12 @@ U64, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device import torch +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 class RandomSampleDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -129,40 +133,100 @@ def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_ torch.npu.synchronize() assert indices[0].type(ans.dtype) == ans or data[ans] == data[indices[0]] + + if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + if(topp > 0 and topk > 1): + _ = random_sample(data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu") + else: + _ = random_sample_0(data) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + if(topp > 0 and topk > 1): + _ = random_sample(data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu") + else: + _ = random_sample_0(data) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): + 
check_error( + lib.infiniopRandomSample( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + indices_tensor.data, + x_tensor.data, + random_val, + topp, + topk, + temperature, + None, + ) + ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopRandomSample( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + indices_tensor.data, + x_tensor.data, + random_val, + topp, + topk, + temperature, + None, + ) + ) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed * 1000 :6f} ms") + check_error(lib.infiniopDestroyRandomSampleDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) for (voc, random_val, topp, topk, temperature) in test_cases: - test(lib, handle, "cpu", voc, random_val, topp, topk, temperature) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cpu", voc, random_val, topp, topk, temperature, x_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) for (voc, random_val, topp, topk, temperature) in test_cases: - test(lib, handle, "cuda", voc, random_val, topp, topk, temperature) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cuda", voc, random_val, topp, topk, temperature, x_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG handle = create_handle(lib, device) for (voc, random_val, topp, topk, temperature) in test_cases: - test(lib, handle, "mlu", voc, random_val, topp, topk, temperature) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "mlu", voc, random_val, topp, topk, temperature, x_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_ascend(lib, test_cases): +def test_ascend(lib, test_cases, tensor_dtypes): import torch_npu device = DeviceEnum.DEVICE_ASCEND handle = create_handle(lib, device) for (voc, random_val, topp, topk, temperature) in test_cases: - test(lib, handle, "npu", voc, random_val, topp, topk, temperature) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "npu", voc, random_val, topp, topk, temperature, x_dtype=tensor_dtype) destroy_handle(lib, handle) @@ -180,6 +244,9 @@ def test_ascend(lib, test_cases): (32000, 0.08, 1.0, 25, 1.0), # (119696, 0.01, 1.0, 100, 1.0), ] + tensor_dtypes = [ + torch.float16, + ] args = get_args() lib = open_lib() @@ -212,14 +279,16 @@ def test_ascend(lib, test_cases): infiniopRandomSampleDescriptor_t, ] + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if args.ascend: - test_ascend(lib, test_cases) + test_ascend(lib, test_cases, tensor_dtypes) if not (args.cpu or args.cuda or args.bang or args.ascend): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/rearrange.py b/operatorspy/tests/rearrange.py index e9cc81b9..8902633c 100644 --- a/operatorspy/tests/rearrange.py 
+++ b/operatorspy/tests/rearrange.py
@@ -2,6 +2,7 @@
 from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
 import sys
 import os
+import time
 
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
 from operatorspy import (
@@ -17,9 +18,12 @@
     rearrange_tensor,
 )
 
-from operatorspy.tests.test_utils import get_args
+from operatorspy.tests.test_utils import get_args, synchronize_device
 import torch
 
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
 
 class RerrangeDescriptor(Structure):
     _fields_ = [("device", c_int32)]
@@ -28,6 +32,12 @@ class RerrangeDescriptor(Structure):
 infiniopRearrangeDescriptor_t = POINTER(RerrangeDescriptor)
 
 
+def rearrange(x, x_shape, y_strides):
+    y_ = x.clone()
+    y_.set_(y_.untyped_storage(), 0, x_shape, y_strides)
+    y_[:] = x.view_as(y_)
+    return y_
+
 def test(
     lib,
     handle,
@@ -47,6 +57,9 @@ def test(
         x = rearrange_tensor(x, x_stride)
     if y_stride is not None:
         y = rearrange_tensor(y, y_stride)
+
+    ans = rearrange(x, x_shape, y_stride)
+
     x_tensor = to_tensor(x, lib)
     y_tensor = to_tensor(y, lib)
 
@@ -64,40 +77,72 @@ def test(
     check_error(
         lib.infiniopRearrange(descriptor, y_tensor.data, x_tensor.data, None)
     )
-    assert torch.allclose(x, y, atol=0, rtol=1e-3)
+
+    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
+
+    if PROFILE:
+        # Profiling PyTorch implementation
+        for i in range(NUM_PRERUN):
+            _ = rearrange(x, x_shape, y_stride)
+        synchronize_device(torch_device)
+        start_time = time.time()
+        for i in range(NUM_ITERATIONS):
+            _ = rearrange(x, x_shape, y_stride)
+        synchronize_device(torch_device)
+        elapsed = (time.time() - start_time) / NUM_ITERATIONS
+        print(f" pytorch time: {elapsed * 1000 :6f} ms")
+
+        # Profiling C Operators implementation
+        for i in range(NUM_PRERUN):
+            check_error(
+                lib.infiniopRearrange(descriptor, y_tensor.data, x_tensor.data, None)
+            )
+        synchronize_device(torch_device)
+        start_time = time.time()
+        for i in range(NUM_ITERATIONS):
+            check_error(
+                lib.infiniopRearrange(descriptor, y_tensor.data, x_tensor.data, None)
+            )
+        synchronize_device(torch_device)
+        elapsed = (time.time() - start_time) / NUM_ITERATIONS
+        print(f" lib time: {elapsed * 1000 :6f} ms")
+
     check_error(lib.infiniopDestroyRearrangeDescriptor(descriptor))
 
 
-def test_cpu(lib, test_cases):
+def test_cpu(lib, test_cases, tensor_dtypes):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
     for test_case in test_cases:
         x_shape, x_stride = test_case[0]
         y_shape, y_stride = test_case[1]
-        test(lib, handle, "cpu", x_shape, x_stride, y_shape, y_stride)
+        for tensor_dtype in tensor_dtypes:
+            test(lib, handle, "cpu", x_shape, x_stride, y_shape, y_stride, tensor_dtype)
     destroy_handle(lib, handle)
 
 
-def test_cuda(lib, test_cases):
+def test_cuda(lib, test_cases, tensor_dtypes):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
     for test_case in test_cases:
         x_shape, x_stride = test_case[0]
         y_shape, y_stride = test_case[1]
-        test(lib, handle, "cuda", x_shape, x_stride, y_shape, y_stride)
+        for tensor_dtype in tensor_dtypes:
+            test(lib, handle, "cuda", x_shape, x_stride, y_shape, y_stride, tensor_dtype)
     destroy_handle(lib, handle)
 
 
-def test_bang(lib, test_cases):
+def test_bang(lib, test_cases, tensor_dtypes):
     import torch_mlu
 
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
     for test_case in test_cases:
         x_shape, x_stride = test_case[0]
         y_shape, y_stride = test_case[1]
-        test(lib, handle, "mlu", x_shape, x_stride, y_shape, y_stride)
+        for tensor_dtype in tensor_dtypes:
+            test(lib, handle, "mlu", x_shape,
x_stride, y_shape, y_stride, tensor_dtype) destroy_handle(lib, handle) -def test_ascend(lib, test_cases): +def test_ascend(lib, test_cases, tensor_dtypes): import torch_npu device = DeviceEnum.DEVICE_ASCEND @@ -105,7 +150,8 @@ def test_ascend(lib, test_cases): for test_case in test_cases: x_shape, x_stride = test_case[0] y_shape, y_stride = test_case[1] - test(lib, handle, "npu", x_shape, x_stride, y_shape, y_stride) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "npu", x_shape, x_stride, y_shape, y_stride, tensor_dtype) destroy_handle(lib, handle) if __name__ == "__main__": @@ -120,6 +166,9 @@ def test_ascend(lib, test_cases): (((4, 1, 64), (64, 2560, 1)), ((4, 1, 64), (64, 11264, 1))), (((64,), (1,)), ((64,), (1,))), ] + tensor_dtypes = [ + torch.float16, + ] lib = open_lib() lib.infiniopCreateRearrangeDescriptor.restype = c_int32 lib.infiniopCreateRearrangeDescriptor.argtypes = [ @@ -137,12 +186,15 @@ def test_ascend(lib, test_cases): ] lib.infiniopDestroyRearrangeDescriptor.restype = c_int32 lib.infiniopDestroyRearrangeDescriptor.argtypes = [infiniopRearrangeDescriptor_t] + + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if args.ascend: - test_ascend(lib, test_cases) + test_ascend(lib, test_cases, tensor_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/relu.py b/operatorspy/tests/relu.py index b7f76627..5940468f 100644 --- a/operatorspy/tests/relu.py +++ b/operatorspy/tests/relu.py @@ -16,13 +16,10 @@ check_error, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device from enum import Enum, auto import torch -# constant for control whether profile the pytorch and lib functions -# NOTE: need to manually add synchronization function to the lib function, -# e.g., cudaDeviceSynchronize() for CUDA PROFILE = False NUM_PRERUN = 10 NUM_ITERATIONS = 1000 @@ -41,10 +38,6 @@ class ReluDescriptor(Structure): def relu(x): - if PROFILE: - ans = torch.nn.functional.relu(x).to(x.dtype) - torch.cuda.synchronize() - return ans return torch.nn.functional.relu(x).to(x.dtype) @@ -63,14 +56,7 @@ def test( x = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1 y = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else x - for i in range(NUM_PRERUN if PROFILE else 1): - ans = relu(x) - if PROFILE: - start_time = time.time() - for i in range(NUM_ITERATIONS): - _ = relu(x) - elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f"pytorch time: {elapsed :6f}") + ans = relu(x) x_tensor = to_tensor(x, lib) y_tensor = to_tensor(y, lib) if inplace == Inplace.OUT_OF_PLACE else x_tensor @@ -89,47 +75,62 @@ def test( x_tensor.descriptor.contents.invalidate() y_tensor.descriptor.contents.invalidate() - for i in range(NUM_PRERUN if PROFILE else 1): - check_error(lib.infiniopRelu(descriptor, y_tensor.data, x_tensor.data, None)) + check_error(lib.infiniopRelu(descriptor, y_tensor.data, x_tensor.data, None)) + + assert torch.allclose(y, ans, atol=0, rtol=0) + if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + _ = relu(x) + synchronize_device(torch_device) start_time = time.time() for i in range(NUM_ITERATIONS): - check_error( - lib.infiniopRelu(descriptor, y_tensor.data, 
x_tensor.data, None) - ) + _ = relu(x) + synchronize_device(torch_device) elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f" lib time: {elapsed :6f}") + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): + check_error(lib.infiniopRelu(descriptor, y_tensor.data, x_tensor.data, None)) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error(lib.infiniopRelu(descriptor, y_tensor.data, x_tensor.data, None)) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed * 1000 :6f} ms") - assert torch.allclose(y, ans, atol=0, rtol=1e-3) check_error(lib.infiniopDestroyReluDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) for tensor_shape, inplace in test_cases: - test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float16, inplace=inplace) - test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float32, inplace=inplace) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cpu", tensor_shape, tensor_dtype=tensor_dtype, inplace=inplace) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) for tensor_shape, inplace in test_cases: - test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float16, inplace=inplace) - test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float32, inplace=inplace) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cuda", tensor_shape, tensor_dtype=tensor_dtype, inplace=inplace) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG handle = create_handle(lib, device) for tensor_shape, inplace in test_cases: - test(lib, handle, "mlu", tensor_shape, tensor_dtype=torch.float16, inplace=inplace) - test(lib, handle, "mlu", tensor_shape, tensor_dtype=torch.float32, inplace=inplace) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "mlu", tensor_shape, tensor_dtype=tensor_dtype, inplace=inplace) destroy_handle(lib, handle) @@ -145,6 +146,9 @@ def test_bang(lib, test_cases): ((33, 333, 333), Inplace.OUT_OF_PLACE), ((32, 256, 112, 112), Inplace.OUT_OF_PLACE), ] + tensor_dtypes = [ + torch.float16, torch.float32, + ] args = get_args() lib = open_lib() lib.infiniopCreateReluDescriptor.restype = c_int32 @@ -166,12 +170,14 @@ def test_bang(lib, test_cases): infiniopReluDescriptor_t, ] + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if not (args.cpu or args.cuda or args.bang): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/rms_norm.py b/operatorspy/tests/rms_norm.py index 13cf1ccf..3845e7be 100644 --- a/operatorspy/tests/rms_norm.py +++ b/operatorspy/tests/rms_norm.py @@ -2,6 +2,7 @@ import ctypes import sys import os +import time sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) from operatorspy import ( @@ -17,9 +18,13 @@ create_workspace, ) 
-from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device import torch +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + class RMSNormDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -84,44 +89,102 @@ def test(lib, handle, torch_device, y_shape, x_shape, w_shape, dtype=torch.float ) assert torch.allclose(y.to(dtype), ans.to(dtype), atol=1e-3, rtol=1e-3) + + if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + _ = rms_norm(x, w, eps) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = rms_norm(x, w, eps) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): + check_error( + lib.infiniopRMSNorm( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + y_tensor.data, + x_tensor.data, + w_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopRMSNorm( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + y_tensor.data, + x_tensor.data, + w_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed * 1000 :6f} ms") + check_error(lib.infiniopDestroyRMSNormDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes, w_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) - for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases: - test(lib, handle, "cpu", y_shape, x_shape, w_shape, dtype, w_dtype) + for (y_shape, x_shape, w_shape) in test_cases: + for tensor_dtype in tensor_dtypes: + for w_dtype in w_dtypes: + test(lib, handle, "cpu", y_shape, x_shape, w_shape, tensor_dtype, w_dtype) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes, w_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) - for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases: - test(lib, handle, "cuda", y_shape, x_shape, w_shape, dtype, w_dtype) + for (y_shape, x_shape, w_shape) in test_cases: + for tensor_dtype in tensor_dtypes: + for w_dtype in w_dtypes: + test(lib, handle, "cuda", y_shape, x_shape, w_shape, tensor_dtype, w_dtype) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes, w_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG handle = create_handle(lib, device) - for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases: - test(lib, handle, "mlu", y_shape, x_shape, w_shape, dtype, w_dtype) + for (y_shape, x_shape, w_shape) in test_cases: + for tensor_dtype in tensor_dtypes: + for w_dtype in w_dtypes: + test(lib, handle, "mlu", y_shape, x_shape, w_shape, tensor_dtype, w_dtype) destroy_handle(lib, handle) -def test_ascend(lib, test_cases): +def test_ascend(lib, test_cases, tensor_dtypes, w_dtypes): import torch_npu device = DeviceEnum.DEVICE_ASCEND handle = create_handle(lib, device) - for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases: - test(lib, handle, "npu", y_shape, x_shape, w_shape, dtype, w_dtype) + for (y_shape, x_shape, w_shape) in test_cases: + for tensor_dtype in tensor_dtypes: + for 
w_dtype in w_dtypes: + test(lib, handle, "npu", y_shape, x_shape, w_shape, tensor_dtype, w_dtype) destroy_handle(lib, handle) if __name__ == "__main__": test_cases = [ - # y_shape, x_shape, w_shape, dtype, w_dtype - ((16, 2048), (16, 2048), (2048,), torch.float16, torch.float16), - ((16, 2048), (16, 2048), (2048,), torch.float16, torch.float32), + # y_shape, x_shape, w_shape, + ((16, 2048), (16, 2048), (2048,)), + ((32, 4096), (32, 4096), (4096,)), + ] + tensor_dtypes = [ + torch.float16, + ] + w_dtypes = [ + torch.float16, torch.float32, ] args = get_args() lib = open_lib() @@ -156,14 +219,16 @@ def test_ascend(lib, test_cases): infiniopRMSNormDescriptor_t, ] + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes, w_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes, w_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes, w_dtypes) if args.ascend: - test_ascend(lib, test_cases) + test_ascend(lib, test_cases, tensor_dtypes, w_dtypes) if not (args.cpu or args.cuda or args.bang or args.ascend): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes, w_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/rotary_embedding.py b/operatorspy/tests/rotary_embedding.py index 081d2f91..1442ad04 100644 --- a/operatorspy/tests/rotary_embedding.py +++ b/operatorspy/tests/rotary_embedding.py @@ -2,7 +2,7 @@ from ctypes import c_float, POINTER, c_void_p, c_int32, c_uint64, Structure, byref import sys import os - +import time sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) from operatorspy import ( @@ -19,9 +19,12 @@ U64, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device import torch +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 class RoPEDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -135,53 +138,116 @@ def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16): ) assert torch.allclose(t, ans, atol=1e-4, rtol=1e-2) + + if PROFILE: + # Profiling PyTorch implementation + posTmp_d = posTmp.to(torch_device) + + for i in range(NUM_PRERUN): + if torch_device == 'mlu' or torch_device == 'npu': + _ = rotary_embedding(t, posTmp, theta, "cpu").to(torch_device) + else: + _ = rotary_embedding(t, posTmp_d, theta, torch_device) + synchronize_device(torch_device) + + start_time = time.time() + for i in range(NUM_ITERATIONS): + if torch_device == 'mlu' or torch_device == 'npu': + _ = rotary_embedding(t, posTmp, theta, "cpu").to(torch_device) + else: + _ = rotary_embedding(t, posTmp_d, theta, torch_device) + synchronize_device(torch_device) + + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): + check_error( + lib.infiniopRoPE( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + t_tensor.data, + pos_tensor.data, + sin_table_tensor.data, + cos_table_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopRoPE( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + t_tensor.data, + pos_tensor.data, + sin_table_tensor.data, + cos_table_tensor.data, + None, + ) + ) + 
synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed * 1000 :6f} ms") + check_error(lib.infiniopDestroyRoPEDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) - for shape, strides, dtype in test_cases: - test(lib, handle, "cpu", shape, strides, dtype) + for shape, strides in test_cases: + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cpu", shape, strides, tensor_dtype) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) - for shape, strides, dtype in test_cases: - test(lib, handle, "cuda", shape, strides, dtype) + for shape, strides in test_cases: + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cuda", shape, strides, tensor_dtype) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG handle = create_handle(lib, device) - for shape, strides, dtype in test_cases: - test(lib, handle, "mlu", shape, strides, dtype) + for shape, strides in test_cases: + for tensor_dtype in tensor_dtypes: + test(lib, handle, "mlu", shape, strides, tensor_dtype) destroy_handle(lib, handle) -def test_ascend(lib, test_cases) : +def test_ascend(lib, test_cases, tensor_dtypes) : import torch_npu device = DeviceEnum.DEVICE_ASCEND handle = create_handle(lib, device) - for shape, strides, dtype in test_cases: - test(lib, handle, "npu", shape, strides, dtype) + for shape, strides in test_cases: + for tensor_dtype in tensor_dtypes: + test(lib, handle, "npu", shape, strides, tensor_dtype) destroy_handle(lib, handle) if __name__ == "__main__": test_cases = [ - ((1, 32, 128), None, torch.float16), - ((1, 32, 64), None, torch.float16), + ((1, 32, 128), None), + ((1, 32, 64), None), # 昇腾暂不满足这个用例,最后一维度 <=32 会有问题,可能与其核心 # 接口 GatherMask 的内部实现相关,目前 48 64 128 都可以支持 - ((4, 1, 32), None, torch.float16), - ((1, 32, 128), None, torch.float16), + ((4, 1, 32), None), + ((1, 32, 128), None), - ((3, 32, 128), (8000, 200, 1), torch.float16), + ((3, 32, 128), (8000, 200, 1)), + ] + tensor_dtypes = [ + torch.float16, ] args = get_args() lib = open_lib() @@ -214,14 +280,17 @@ def test_ascend(lib, test_cases) : lib.infiniopDestroyRoPEDescriptor.argtypes = [ infiniopRoPEDescriptor_t, ] + + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if args.ascend: - test_ascend(lib, test_cases) + test_ascend(lib, test_cases, tensor_dtypes) if not (args.cpu or args.cuda or args.bang or args.ascend): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/swiglu.py b/operatorspy/tests/swiglu.py index 7fb447a1..a8772630 100644 --- a/operatorspy/tests/swiglu.py +++ b/operatorspy/tests/swiglu.py @@ -2,6 +2,7 @@ import ctypes import sys import os +import time sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) from operatorspy import ( @@ -17,9 +18,20 @@ rearrange_tensor, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, 
synchronize_device +from enum import Enum, auto import torch +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + class SwiGLUDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -28,138 +40,72 @@ class SwiGLUDescriptor(Structure): infiniopSwiGLUDescriptor_t = POINTER(SwiGLUDescriptor) -def swiglu(a, b): - - return a * b / (1 + torch.exp(-b.float()).to(b.dtype)) +def print_discrepancy(actual, expected, atol=0, rtol=1e-2): + if actual.shape != expected.shape: + raise ValueError("Tensors must have the same shape to compare.") -def test_out_of_place( - lib, - handle, - torch_device, - shape, - a_stride=None, - b_stride=None, - c_stride=None, - dtype=torch.float16, - sync=None, -): - print( - f"Testing SwiGLU on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} dtype:{dtype}" - ) - a = torch.rand(shape, dtype=dtype).to(torch_device) - b = torch.rand(shape, dtype=dtype).to(torch_device) - c = torch.rand(shape, dtype=dtype).to(torch_device) - - if a_stride is not None: - a = rearrange_tensor(a, a_stride) - if b_stride is not None: - b = rearrange_tensor(b, b_stride) - if c_stride is not None: - c = rearrange_tensor(c, c_stride) - ans = swiglu(a, b) + # Calculate the difference mask based on atol and rtol + diff_mask = torch.abs(actual - expected) > (atol + rtol * torch.abs(expected)) + diff_indices = torch.nonzero(diff_mask, as_tuple=False) - if sync is not None: - sync() + # Widths for columns (adjusted based on data) + col_width = [18, 18, 18, 18] + decimal_places = [0, 12, 12, 12] - a_tensor = to_tensor(a, lib) - b_tensor = to_tensor(b, lib) - c_tensor = to_tensor(c, lib) - descriptor = infiniopSwiGLUDescriptor_t() - check_error( - lib.infiniopCreateSwiGLUDescriptor( - handle, - ctypes.byref(descriptor), - c_tensor.descriptor, - a_tensor.descriptor, - b_tensor.descriptor, + for idx in diff_indices: + index_tuple = tuple(idx.tolist()) + print( + f" > Index: {str(index_tuple):<{col_width[0]}}" + f"actual: \033[31m{actual[index_tuple]:<{col_width[1]}.{decimal_places[1]}f}\033[0m" + f"expect: \033[32m{expected[index_tuple]:<{col_width[2]}.{decimal_places[2]}f}\033[0m" + f"delta: \033[33m{actual[index_tuple] - expected[index_tuple]:<{col_width[3]}.{decimal_places[3]}f}\033[0m" ) - ) - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - a_tensor.descriptor.contents.invalidate() - b_tensor.descriptor.contents.invalidate() - c_tensor.descriptor.contents.invalidate() + return diff_indices - check_error( - lib.infiniopSwiGLU( - descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None - ) - ) - assert torch.allclose(c, ans, atol=1e-4, rtol=1e-2) - print("out-of-place Test passed!") +def swiglu(a, b): - check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor)) + return a * b / (1 + torch.exp(-b.float()).to(b.dtype)) -def test_in_place1( +def test( lib, handle, torch_device, shape, a_stride=None, b_stride=None, + c_stride=None, dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, sync=None, ): + print( + f"Testing SwiGLU on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} dtype:{dtype} " + f"inplace:{inplace}" + ) a = torch.rand(shape, dtype=dtype).to(torch_device) b = torch.rand(shape, dtype=dtype).to(torch_device) + c = ( + torch.rand(shape, dtype=dtype).to(torch_device) + if inplace == Inplace.OUT_OF_PLACE + else (a if inplace == Inplace.INPLACE_A 
else b) + ) if a_stride is not None: a = rearrange_tensor(a, a_stride) if b_stride is not None: b = rearrange_tensor(b, b_stride) - ans = swiglu(a, b) - - if sync is not None: - sync() - - a_tensor = to_tensor(a, lib) - b_tensor = to_tensor(b, lib) - descriptor = infiniopSwiGLUDescriptor_t() - check_error( - lib.infiniopCreateSwiGLUDescriptor( - handle, - ctypes.byref(descriptor), - a_tensor.descriptor, - a_tensor.descriptor, - b_tensor.descriptor, - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - a_tensor.descriptor.contents.invalidate() - b_tensor.descriptor.contents.invalidate() + if c_stride is not None and inplace == Inplace.OUT_OF_PLACE: + c = rearrange_tensor(c, c_stride) - check_error( - lib.infiniopSwiGLU( - descriptor, a_tensor.data, a_tensor.data, b_tensor.data, None - ) + c = ( + c + if inplace == Inplace.OUT_OF_PLACE + else (a if inplace == Inplace.INPLACE_A else b) ) - assert torch.allclose(a, ans, atol=1e-4, rtol=1e-2) - print("in-place1 Test passed!") - - check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor)) - - -def test_in_place2( - lib, - handle, - torch_device, - shape, - a_stride=None, - b_stride=None, - dtype=torch.float16, - sync=None, -): - a = torch.rand(shape, dtype=dtype).to(torch_device) - b = torch.rand(shape, dtype=dtype).to(torch_device) - - if a_stride is not None: - a = rearrange_tensor(a, a_stride) - if b_stride is not None: - b = rearrange_tensor(b, b_stride) ans = swiglu(a, b) if sync is not None: @@ -167,12 +113,17 @@ def test_in_place2( a_tensor = to_tensor(a, lib) b_tensor = to_tensor(b, lib) + c_tensor = ( + to_tensor(c, lib) + if inplace == Inplace.OUT_OF_PLACE + else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) + ) descriptor = infiniopSwiGLUDescriptor_t() check_error( lib.infiniopCreateSwiGLUDescriptor( handle, ctypes.byref(descriptor), - b_tensor.descriptor, + c_tensor.descriptor, a_tensor.descriptor, b_tensor.descriptor, ) @@ -181,83 +132,153 @@ def test_in_place2( # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel a_tensor.descriptor.contents.invalidate() b_tensor.descriptor.contents.invalidate() + c_tensor.descriptor.contents.invalidate() check_error( lib.infiniopSwiGLU( - descriptor, b_tensor.data, a_tensor.data, b_tensor.data, None + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None ) ) - assert torch.allclose(b, ans, atol=1e-4, rtol=1e-2) + assert torch.allclose(c, ans, atol=1e-4, rtol=1e-2) + + if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + _ = swiglu(a, b) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = swiglu(a, b) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): + check_error( + lib.infiniopSwiGLU( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopSwiGLU( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + ) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed * 1000 :6f} ms") check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, 
tensor_dtypes):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
-    for shape, a_stride, b_stride, c_stride, dtype in test_cases:
-        test_out_of_place(
-            lib, handle, "cpu", shape, a_stride, b_stride, c_stride, dtype
-        )
-        test_in_place1(lib, handle, "cpu", shape, a_stride, b_stride, dtype)
-        test_in_place2(lib, handle, "cpu", shape, a_stride, b_stride, dtype)
+    for shape, a_stride, b_stride, c_stride, inplace in test_cases:
+        for tensor_dtype in tensor_dtypes:
+            test(
+                lib,
+                handle,
+                "cpu",
+                shape,
+                a_stride,
+                b_stride,
+                c_stride,
+                tensor_dtype,
+                inplace,
+            )
     destroy_handle(lib, handle)
 
 
-def test_cuda(lib, test_cases):
+def test_cuda(lib, test_cases, tensor_dtypes):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
-    for shape, a_stride, b_stride, c_stride, dtype in test_cases:
-        test_out_of_place(
-            lib, handle, "cuda", shape, a_stride, b_stride, c_stride, dtype
-        )
-        test_in_place1(lib, handle, "cuda", shape, a_stride, b_stride, dtype)
-        test_in_place2(lib, handle, "cuda", shape, a_stride, b_stride, dtype)
+    for shape, a_stride, b_stride, c_stride, inplace in test_cases:
+        for tensor_dtype in tensor_dtypes:
+            test(
+                lib,
+                handle,
+                "cuda",
+                shape,
+                a_stride,
+                b_stride,
+                c_stride,
+                tensor_dtype,
+                inplace,
+            )
     destroy_handle(lib, handle)
 
 
-def test_bang(lib, test_cases):
+def test_bang(lib, test_cases, tensor_dtypes):
     import torch_mlu
+
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
-    for shape, a_stride, b_stride, c_stride, dtype in test_cases:
-        test_out_of_place(
-            lib, handle, "mlu", shape, a_stride, b_stride, c_stride, dtype
-        )
-        test_in_place1(lib, handle, "mlu", shape, a_stride, b_stride, dtype)
-        test_in_place2(lib, handle, "mlu", shape, a_stride, b_stride, dtype)
+    for shape, a_stride, b_stride, c_stride, inplace in test_cases:
+        for tensor_dtype in tensor_dtypes:
+            test(
+                lib,
+                handle,
+                "mlu",
+                shape,
+                a_stride,
+                b_stride,
+                c_stride,
+                tensor_dtype,
+                inplace,
+            )
     destroy_handle(lib, handle)
 
 
-def test_ascend(lib, test_cases):
+def test_ascend(lib, test_cases, tensor_dtypes):
     import torch_npu
+
     device = DeviceEnum.DEVICE_ASCEND
     handle = create_handle(lib, device)
-    for shape, a_stride, b_stride, c_stride, dtype in test_cases:
-        test_out_of_place(
-            lib, handle, "npu", shape, a_stride, b_stride, c_stride, dtype, torch.npu.synchronize
-        )
-        test_in_place1(lib, handle, "npu", shape, a_stride, b_stride, dtype, torch.npu.synchronize)
-        test_in_place2(lib, handle, "npu", shape, a_stride, b_stride, dtype, torch.npu.synchronize)
+    for shape, a_stride, b_stride, c_stride, inplace in test_cases:
+        for tensor_dtype in tensor_dtypes:
+            test(
+                lib,
+                handle,
+                "npu",
+                shape,
+                a_stride,
+                b_stride,
+                c_stride,
+                tensor_dtype,
+                inplace,
+                torch.npu.synchronize,
+            )
-    destroy_handle(lib, handle)
+    destroy_handle(lib, handle)
 
 
 if __name__ == "__main__":
     test_cases = [
-        # shape, a_stride, b_stride, c_stride, dtype
-        ((13, 4), None, None, None, torch.float16),
-        ((13, 4), (10, 1), (10, 1), (10, 1), torch.float16),
-        ((16, 5632), None, None, None, torch.float16),
-        ((16, 5632), (13312, 1), (13312, 1), (13312, 1), torch.float16),
+        # shape, a_stride, b_stride, c_stride, inplace
+        ((13, 4), None, None, None, Inplace.OUT_OF_PLACE),
+        ((13, 4), None, None, None, Inplace.INPLACE_A),
+        ((13, 4), None, None, None, Inplace.INPLACE_B),
+        ((13, 4), (10, 1), (10, 1), (10, 1), Inplace.OUT_OF_PLACE),
+        ((16, 5632), None, None, None, Inplace.OUT_OF_PLACE),
+        ((16, 5632), (13312, 1), (13312, 1), (13312, 1), Inplace.OUT_OF_PLACE),
+        ((16, 5632), (13312, 1), (13312, 1), (13312, 1), Inplace.INPLACE_A),
+        ((16, 5632), (13312, 1), (13312, 1), (13312, 1), Inplace.INPLACE_B),
+    ]
+    tensor_dtypes = [
+        torch.float16,
     ]
     args = get_args()
     lib = open_lib()
@@ -285,12 +306,16 @@ def test_ascend(lib, test_cases):
         infiniopSwiGLUDescriptor_t,
     ]
 
+    if args.profile:
+        PROFILE = True
     if args.cpu:
-        test_cpu(lib, test_cases)
+        test_cpu(lib, test_cases, tensor_dtypes)
     if args.cuda:
-        test_cuda(lib, test_cases)
+        test_cuda(lib, test_cases, tensor_dtypes)
     if args.bang:
-        test_bang(lib, test_cases)
+        test_bang(lib, test_cases, tensor_dtypes)
     if args.ascend:
-        test_ascend(lib, test_cases)
+        test_ascend(lib, test_cases, tensor_dtypes)
+    if not (args.cpu or args.cuda or args.bang or args.ascend):
+        test_cpu(lib, test_cases, tensor_dtypes)
     print("\033[92mTest passed!\033[0m")
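
Note for reviewers: the refactored tests import `synchronize_device` from `operatorspy/tests/test_utils.py` and read an `args.profile` flag, but that file is not part of this patch. The following is a minimal sketch of what those helpers are assumed to look like; the flag name, the argparse layout, and the per-backend synchronize calls are inferred from how the tests use them, not taken from the repository.

```python
# Hypothetical sketch only -- operatorspy/tests/test_utils.py is not touched by
# this patch, so everything below is an assumption inferred from the call sites.
import argparse

import torch


def synchronize_device(torch_device):
    # Block until all queued kernels on the given device have finished, so the
    # wall-clock timings around the NUM_ITERATIONS loops measure device work
    # rather than just kernel launch overhead.
    if torch_device == "cuda":
        torch.cuda.synchronize()
    elif torch_device == "mlu":
        torch.mlu.synchronize()  # assumed to be available once torch_mlu is imported
    elif torch_device == "npu":
        torch.npu.synchronize()  # assumed to be available once torch_npu is imported
    # "cpu" executes synchronously; nothing to do.


def get_args():
    parser = argparse.ArgumentParser(description="Run operator tests")
    parser.add_argument("--cpu", action="store_true")
    parser.add_argument("--cuda", action="store_true")
    parser.add_argument("--bang", action="store_true")
    parser.add_argument("--ascend", action="store_true")
    # New flag consumed by the refactored tests: `if args.profile: PROFILE = True`
    parser.add_argument("--profile", action="store_true")
    return parser.parse_args()
```

With helpers along these lines, a command such as `python operatorspy/tests/swiglu.py --cuda --profile` would run the correctness checks and then print the averaged PyTorch and library timings in milliseconds.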