From ca2f34cf4a5f082e0448dd4cdd7b194e1aab2d89 Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Thu, 20 Feb 2025 14:05:09 +0800 Subject: [PATCH 1/5] issue/66: modified test py --- test/infiniop/causal_softmax.py | 179 ++++++++++--------- test/infiniop/random_sample.py | 162 ++++++++--------- test/infiniop/rearrange.py | 172 +++++++++--------- test/infiniop/rms_norm.py | 201 ++++++++++----------- test/infiniop/rotary_embedding.py | 80 +++++---- test/infiniop/swiglu.py | 287 ++++++++++++++---------------- 6 files changed, 544 insertions(+), 537 deletions(-) diff --git a/test/infiniop/causal_softmax.py b/test/infiniop/causal_softmax.py index a5c66bfbb..64ba65acc 100644 --- a/test/infiniop/causal_softmax.py +++ b/test/infiniop/causal_softmax.py @@ -1,26 +1,50 @@ -from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p +import torch import ctypes -import sys -import os - - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) -from operatorspy import ( - open_lib, - to_tensor, - DeviceEnum, +from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float +from libinfiniop import ( infiniopHandle_t, infiniopTensorDescriptor_t, - create_handle, - destroy_handle, + open_lib, + to_tensor, + get_test_devices, check_error, - rearrange_tensor, + rearrange_if_needed, create_workspace, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, ) -from operatorspy.tests.test_utils import get_args -import torch +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES = [ + # x_shape, x_stride + ((32, 512), None), + ((32, 512), (1024, 1)), + ((32, 5, 5), None), + ((32, 20, 512), None), + ((32, 20, 512), (20480, 512, 1)), # Ascend 暂不支持非连续 + ((32, 20, 4, 512), None), + ((32, 20, 4, 512), (81920, 2048, 512, 1)), + ] +# Data types used for testing +_TENSOR_DTYPES = [torch.float16, torch.float32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + torch.float16: {'atol': 0, 'rtol': 1e-2}, + torch.float32: {'atol': 0, 'rtol': 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 class CausalSoftmaxDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -37,88 +61,78 @@ def causal_softmax(x): return torch.nn.functional.softmax(masked, dim=-1).to(type) -def test(lib, handle, torch_device, x_shape, x_stride=None, x_dtype=torch.float16): +def test( + lib, + handle, + torch_device, + x_shape, + x_stride=None, + dtype=torch.float16 +): print( - f"Testing CausalSoftmax on {torch_device} with x_shape:{x_shape} x_stride:{x_stride} dtype:{x_dtype}" + f"Testing CausalSoftmax on {torch_device} with x_shape:{x_shape} x_stride:{x_stride} dtype:{dtype}" ) - x = torch.rand(x_shape, dtype=x_dtype).to(torch_device) - if x_stride is not None: - x = rearrange_tensor(x, x_stride) + x = torch.rand(x_shape, dtype=dtype).to(torch_device) + ans = causal_softmax(x) + + + x = rearrange_if_needed(x, x_stride) + x_tensor = to_tensor(x, lib) + descriptor = infiniopCausalSoftmaxDescriptor_t() check_error( lib.infiniopCreateCausalSoftmaxDescriptor( - handle, ctypes.byref(descriptor), x_tensor.descriptor + handle, + ctypes.byref(descriptor), + x_tensor.descriptor ) ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + 
x_tensor.descriptor.contents.invalidate() + + workspace_size = c_uint64(0) check_error( lib.infiniopGetCausalSoftmaxWorkspaceSize( descriptor, ctypes.byref(workspace_size) ) ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - x_tensor.descriptor.contents.invalidate() - workspace = create_workspace(workspace_size.value, x.device) - check_error( - lib.infiniopCausalSoftmax( - descriptor, - workspace.data_ptr() if workspace is not None else None, - workspace_size.value, - x_tensor.data, - None, + def lib_causal_softmax(): + check_error( + lib.infiniopCausalSoftmax( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + x_tensor.data, + None, + ) ) - ) - assert torch.allclose(x, ans, atol=0, rtol=1e-2) - check_error(lib.infiniopDestroyCausalSoftmaxDescriptor(descriptor)) - - -def test_cpu(lib, test_cases): - device = DeviceEnum.DEVICE_CPU - handle = create_handle(lib, device) - for x_shape, x_stride in test_cases: - test(lib, handle, "cpu", x_shape, x_stride) - destroy_handle(lib, handle) - - -def test_cuda(lib, test_cases): - device = DeviceEnum.DEVICE_CUDA - handle = create_handle(lib, device) - for x_shape, x_stride in test_cases: - test(lib, handle, "cuda", x_shape, x_stride) - destroy_handle(lib, handle) - - -def test_bang(lib, test_cases): - import torch_mlu - - device = DeviceEnum.DEVICE_BANG - handle = create_handle(lib, device) - for x_shape, x_stride in test_cases: - test(lib, handle, "mlu", x_shape, x_stride) - destroy_handle(lib, handle) + lib_causal_softmax() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(x, ans, atol=atol, rtol=rtol) + assert torch.allclose(x, ans, atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: causal_softmax(x), torch_device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_causal_softmax(), torch_device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(lib.infiniopDestroyCausalSoftmaxDescriptor(descriptor)) -def test_ascend(lib, test_cases): - import torch_npu - device = DeviceEnum.DEVICE_ASCEND - handle = create_handle(lib, device) - for x_shape, x_stride in test_cases: - test(lib, handle, "npu", x_shape, x_stride) - destroy_handle(lib, handle) if __name__ == "__main__": - test_cases = [ - # x_shape, x_stride - ((32, 20, 512), None), - ((32, 20, 512), (20480, 512, 1)), # Ascend 暂不支持非连续 - ] + args = get_args() lib = open_lib() lib.infiniopCreateCausalSoftmaxDescriptor.restype = c_int32 @@ -144,15 +158,14 @@ def test_ascend(lib, test_cases): lib.infiniopDestroyCausalSoftmaxDescriptor.argtypes = [ infiniopCausalSoftmaxDescriptor_t, ] + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) - if args.cpu: - test_cpu(lib, test_cases) - if args.cuda: - test_cuda(lib, test_cases) - if args.bang: - test_bang(lib, test_cases) - if args.ascend: - test_ascend(lib, test_cases) - if not (args.cpu or args.cuda or args.bang or args.ascend): - test_cpu(lib, test_cases) print("\033[92mTest passed!\033[0m") + diff --git a/test/infiniop/random_sample.py b/test/infiniop/random_sample.py index a5eb143ab..c2f4f0e5a 100644 --- a/test/infiniop/random_sample.py +++ b/test/infiniop/random_sample.py @@ -1,25 +1,47 @@ -from ctypes import POINTER, Structure, c_int32, c_uint64, 
c_void_p, c_float +import torch import ctypes -import sys -import os - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) -from operatorspy import ( - open_lib, - to_tensor, - DeviceEnum, +from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float +from libinfiniop import ( infiniopHandle_t, infiniopTensorDescriptor_t, - create_handle, - destroy_handle, + open_lib, + to_tensor, + get_test_devices, check_error, - rearrange_tensor, + rearrange_if_needed, create_workspace, - U64, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, ) -from operatorspy.tests.test_utils import get_args -import torch +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES = [ + # voc, random_val, topp, topk, temperature + (512, 0.8, 0.8, 3, 0.5), + (4096, 0.05, 0.9, 5, 1.0), + (16384, 0.15, 0.85, 10, 2.0), + (512, 0.08, 0, 3, 0.5), + (4096, 0.5, 0.9, 1, 1.0), + (16384, 0.15, 0, 1, 2.0), + (16384, 0.15, 0, 1, 2.0), + (32000, 0.08, 0.8, 50, 1.0), + (32000, 0.08, 1.0, 25, 1.0), + # (119696, 0.01, 1.0, 100, 1.0), +] + +# Data types used for testing +_TENSOR_DTYPES = [torch.float16, torch.float32] + + +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 class RandomSampleDescriptor(Structure): @@ -116,8 +138,8 @@ def test( ) # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - x_tensor.descriptor.contents.invalidate() - indices_tensor.descriptor.contents.invalidate() + for tensor in [x_tensor, indices_tensor]: + tensor.descriptor.contents.invalidate() workspace_size = c_uint64(0) check_error( @@ -126,77 +148,45 @@ def test( ) ) workspace = create_workspace(workspace_size.value, torch_device) - check_error( - lib.infiniopRandomSample( - descriptor, - workspace.data_ptr() if workspace is not None else None, - workspace_size.value, - indices_tensor.data, - x_tensor.data, - random_val, - topp, - topk, - temperature, - None, + + def lib_random_sample(): + check_error( + lib.infiniopRandomSample( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + indices_tensor.data, + x_tensor.data, + random_val, + topp, + topk, + temperature, + None, + ) ) - ) if torch_device == "npu": torch.npu.synchronize() assert indices[0].type(ans.dtype) == ans or data[ans] == data[indices[0]] + + # Profiling workflow + if PROFILE: + # fmt: off + if topp > 0 and topk > 1: + profile_operation("PyTorch", lambda: random_sample( + data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu" + ), torch_device, NUM_PRERUN, NUM_ITERATIONS) + else: + profile_operation("PyTorch", lambda: random_sample_0(data), torch_device, NUM_PRERUN, NUM_ITERATIONS) + + profile_operation(" lib", lambda: lib_random_sample(), torch_device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on check_error(lib.infiniopDestroyRandomSampleDescriptor(descriptor)) -def test_cpu(lib, test_cases): - device = DeviceEnum.DEVICE_CPU - handle = create_handle(lib, device) - for voc, random_val, topp, topk, temperature in test_cases: - test(lib, handle, "cpu", voc, random_val, topp, topk, temperature) - destroy_handle(lib, handle) - - -def test_cuda(lib, test_cases): - device = DeviceEnum.DEVICE_CUDA - handle = create_handle(lib, device) - for voc, random_val, topp, topk, temperature in 
test_cases: - test(lib, handle, "cuda", voc, random_val, topp, topk, temperature) - destroy_handle(lib, handle) - - -def test_bang(lib, test_cases): - import torch_mlu - - device = DeviceEnum.DEVICE_BANG - handle = create_handle(lib, device) - for voc, random_val, topp, topk, temperature in test_cases: - test(lib, handle, "mlu", voc, random_val, topp, topk, temperature) - destroy_handle(lib, handle) - - -def test_ascend(lib, test_cases): - import torch_npu - - device = DeviceEnum.DEVICE_ASCEND - handle = create_handle(lib, device) - for voc, random_val, topp, topk, temperature in test_cases: - test(lib, handle, "npu", voc, random_val, topp, topk, temperature) - destroy_handle(lib, handle) if __name__ == "__main__": - test_cases = [ - # voc, random_val, topp, topk, temperature - (512, 0.8, 0.8, 3, 0.5), - (4096, 0.05, 0.9, 5, 1.0), - (16384, 0.15, 0.85, 10, 2.0), - (512, 0.08, 0, 3, 0.5), - (4096, 0.5, 0.9, 1, 1.0), - (16384, 0.15, 0, 1, 2.0), - (16384, 0.15, 0, 1, 2.0), - (32000, 0.08, 0.8, 50, 1.0), - (32000, 0.08, 1.0, 25, 1.0), - # (119696, 0.01, 1.0, 100, 1.0), - ] args = get_args() lib = open_lib() @@ -229,14 +219,12 @@ def test_ascend(lib, test_cases): infiniopRandomSampleDescriptor_t, ] - if args.cpu: - test_cpu(lib, test_cases) - if args.cuda: - test_cuda(lib, test_cases) - if args.bang: - test_bang(lib, test_cases) - if args.ascend: - test_ascend(lib, test_cases) - if not (args.cpu or args.cuda or args.bang or args.ascend): - test_cpu(lib, test_cases) + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + # Execute tests + for device in get_test_devices(args): + test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/rearrange.py b/test/infiniop/rearrange.py index f9d5306c5..955ee1719 100644 --- a/test/infiniop/rearrange.py +++ b/test/infiniop/rearrange.py @@ -1,24 +1,51 @@ +import torch import ctypes -from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p -import sys -import os - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) -from operatorspy import ( - open_lib, - to_tensor, - CTensor, - DeviceEnum, +from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float +from libinfiniop import ( infiniopHandle_t, infiniopTensorDescriptor_t, - create_handle, - destroy_handle, + open_lib, + to_tensor, + get_test_devices, check_error, - rearrange_tensor, + rearrange_if_needed, + create_workspace, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, ) -from operatorspy.tests.test_utils import get_args -import torch +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES = [ + # ((src_shape, src_stride), (dst_shape, dst_stride)) + (((2, 4, 32), None), ((2, 4, 32), (256, 64, 1))), + (((32, 6, 64), (64, 2560, 1)), ((32, 6, 64), None)), + (((4, 6, 64), (64, 2560, 1)), ((4, 6, 64), (131072, 64, 1))), + (((1, 32, 64), (2048, 64, 1)), ((1, 32, 64), (2048, 64, 1))), + (((32, 1, 64), (64, 2560, 1)), ((32, 1, 64), (64, 64, 1))), + (((4, 1, 64), (64, 2560, 1)), ((4, 1, 64), (64, 11264, 1))), + (((64,), (1,)), ((64,), (1,))), +] + +# Data types used for testing +_TENSOR_DTYPES = [torch.float16, torch.float32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + 
torch.float16: {"atol": 0, "rtol": 1e-3}, + torch.float32: {"atol": 0, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + class RerrangeDescriptor(Structure): @@ -43,12 +70,13 @@ def test( ) x = torch.rand(x_shape, dtype=x_dtype).to(torch_device) y = torch.zeros(y_shape, dtype=x_dtype).to(torch_device) - if x_stride is not None: - x = rearrange_tensor(x, x_stride) - if y_stride is not None: - y = rearrange_tensor(y, y_stride) - x_tensor = to_tensor(x, lib) - y_tensor = to_tensor(y, lib) + + x, y = [ + rearrange_if_needed(tensor, stride) + for tensor, stride in zip([x, y], [x_stride, y_stride]) + ] + x_tensor, y_tensor = [to_tensor(tensor, lib) for tensor in [x, y]] + descriptor = infiniopRearrangeDescriptor_t() check_error( @@ -58,71 +86,42 @@ def test( ) # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - x_tensor.descriptor.contents.invalidate() - y_tensor.descriptor.contents.invalidate() + for tensor in [x_tensor, y_tensor]: + tensor.descriptor.contents.invalidate() + + def lib_rearrange(): + check_error( + lib.infiniopRearrange( + descriptor, + y_tensor.data, + x_tensor.data, + None + ) + ) + lib_rearrange() + + # Validate results + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(x, y, atol=atol, rtol=rtol) + assert torch.allclose(x, y, atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: rearrange_tensor(y, y_stride), torch_device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_rearrange(), torch_device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on - check_error(lib.infiniopRearrange(descriptor, y_tensor.data, x_tensor.data, None)) - assert torch.allclose(x, y, atol=0, rtol=1e-3) check_error(lib.infiniopDestroyRearrangeDescriptor(descriptor)) -def test_cpu(lib, test_cases): - device = DeviceEnum.DEVICE_CPU - handle = create_handle(lib, device) - for test_case in test_cases: - x_shape, x_stride = test_case[0] - y_shape, y_stride = test_case[1] - test(lib, handle, "cpu", x_shape, x_stride, y_shape, y_stride) - destroy_handle(lib, handle) - - -def test_cuda(lib, test_cases): - device = DeviceEnum.DEVICE_CUDA - handle = create_handle(lib, device) - for test_case in test_cases: - x_shape, x_stride = test_case[0] - y_shape, y_stride = test_case[1] - test(lib, handle, "cuda", x_shape, x_stride, y_shape, y_stride) - destroy_handle(lib, handle) - - -def test_bang(lib, test_cases): - import torch_mlu - - device = DeviceEnum.DEVICE_BANG - handle = create_handle(lib, device) - for test_case in test_cases: - x_shape, x_stride = test_case[0] - y_shape, y_stride = test_case[1] - test(lib, handle, "mlu", x_shape, x_stride, y_shape, y_stride) - destroy_handle(lib, handle) - - -def test_ascend(lib, test_cases): - import torch_npu - - device = DeviceEnum.DEVICE_ASCEND - handle = create_handle(lib, device) - for test_case in test_cases: - x_shape, x_stride = test_case[0] - y_shape, y_stride = test_case[1] - test(lib, handle, "npu", x_shape, x_stride, y_shape, y_stride) - destroy_handle(lib, handle) if __name__ == "__main__": args = get_args() - test_cases = [ - # ((src_shape, src_stride), (dst_shape, dst_stride)) - (((2, 4, 32), None), ((2, 4, 32), (256, 64, 1))), - (((32, 6, 64), (64, 2560, 1)), ((32, 6, 64), None)), - (((4, 6, 64), (64, 2560, 1)), ((4, 6, 64), (131072, 64, 1))), - (((1, 32, 64), (2048, 64, 1)), ((1, 32, 64), (2048, 64, 1))), - (((32, 1, 64), (64, 2560, 1)), ((32, 1, 64), (64, 64, 1))), - 
(((4, 1, 64), (64, 2560, 1)), ((4, 1, 64), (64, 11264, 1))), - (((64,), (1,)), ((64,), (1,))), - ] lib = open_lib() + lib.infiniopCreateRearrangeDescriptor.restype = c_int32 lib.infiniopCreateRearrangeDescriptor.argtypes = [ infiniopHandle_t, @@ -139,12 +138,15 @@ def test_ascend(lib, test_cases): ] lib.infiniopDestroyRearrangeDescriptor.restype = c_int32 lib.infiniopDestroyRearrangeDescriptor.argtypes = [infiniopRearrangeDescriptor_t] - if args.cpu: - test_cpu(lib, test_cases) - if args.cuda: - test_cuda(lib, test_cases) - if args.bang: - test_bang(lib, test_cases) - if args.ascend: - test_ascend(lib, test_cases) + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + # Execute tests + for device in get_test_devices(args): + test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/rms_norm.py b/test/infiniop/rms_norm.py index 21e27348e..0adf37241 100644 --- a/test/infiniop/rms_norm.py +++ b/test/infiniop/rms_norm.py @@ -1,25 +1,49 @@ from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float import ctypes -import sys -import os - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) -from operatorspy import ( - open_lib, - to_tensor, - DeviceEnum, +import torch +import ctypes +from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float +from libinfiniop import ( infiniopHandle_t, infiniopTensorDescriptor_t, - create_handle, - destroy_handle, + open_lib, + to_tensor, + get_test_devices, check_error, - rearrange_tensor, + rearrange_if_needed, create_workspace, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, ) -from operatorspy.tests.test_utils import get_args -import torch - +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules + +_TEST_CASES = [ + # y_shape, x_shape, w_shape, y_stride, x_stride, w_dtype + ((16, 2048), (16, 2048), (2048,), None, None,torch.float32), + ((16, 2048), (16, 2048), (2048,), None, None, torch.float16), + ((16, 2048), (16, 2048), (2048,), (4096, 1), (4096, 1),torch.float32), + ((16, 2048), (16, 2048), (2048,), (4096, 1), (4096, 1), torch.float16), +] +# x types used for testing +_TENSOR_DTYPES = [torch.float16, torch.float32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + torch.float16: {"atol": 0, "rtol": 1e-2}, + torch.float32: {"atol": 0, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 class RMSNormDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -27,7 +51,6 @@ class RMSNormDescriptor(Structure): infiniopRMSNormDescriptor_t = POINTER(RMSNormDescriptor) - def rms_norm(x, w, eps): input_dtype = x.dtype hidden_states = x.to(torch.float32) @@ -37,19 +60,18 @@ def rms_norm(x, w, eps): def test( - lib, - handle, - torch_device, - y_shape, - x_shape, - w_shape, - dtype=torch.float16, - w_dtype=torch.float16, -): - print( - f"Testing RMS_Norm on {torch_device} with y_shape:{y_shape} x_shape:{x_shape} w_shape:{w_shape}" - f" dtype:{dtype} w_dtype:{w_dtype}" - ) + lib, + handle, + torch_device, + y_shape, + x_shape, + w_shape, + y_stride, + x_stride, + dtype=torch.float16, + w_dtype=torch.float16): + print(f"Testing RMS_Norm on 
{torch_device} with y_shape:{y_shape} x_shape:{x_shape} w_shape:{w_shape}" + f" dtype:{dtype} w_dtype:{w_dtype}") y = torch.zeros(y_shape, dtype=dtype).to(torch_device) x = torch.rand(x_shape, dtype=dtype).to(torch_device) @@ -58,93 +80,64 @@ def test( eps = 1e-5 ans = rms_norm(x, w, eps) - y_tensor = to_tensor(y, lib) - x_tensor = to_tensor(x, lib) - w_tensor = to_tensor(w, lib) + x = rearrange_if_needed(x, x_stride) + y = rearrange_if_needed(y, y_stride) + + x_tensor, y_tensor, w_tensor = [to_tensor(tensor, lib) for tensor in [x, y, w]] descriptor = infiniopRMSNormDescriptor_t() - w_dataType = 0 if w_dtype == torch.float16 else 1 + w_dataType = 0 if w_dtype==torch.float16 else 1 check_error( lib.infiniopCreateRMSNormDescriptor( - handle, - ctypes.byref(descriptor), - y_tensor.descriptor, - x_tensor.descriptor, - w_tensor.descriptor, - eps, + handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor, + w_tensor.descriptor, eps ) ) # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - x_tensor.descriptor.contents.invalidate() - y_tensor.descriptor.contents.invalidate() - w_tensor.descriptor.contents.invalidate() + for tensor in [x_tensor, y_tensor, w_tensor]: + tensor.descriptor.contents.invalidate() workspace_size = c_uint64(0) check_error( - lib.infiniopGetRMSNormWorkspaceSize(descriptor, ctypes.byref(workspace_size)) + lib.infiniopGetRMSNormWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) ) workspace = create_workspace(workspace_size.value, y.device) - check_error( - lib.infiniopRMSNorm( - descriptor, - workspace.data_ptr() if workspace is not None else None, - workspace_size.value, - y_tensor.data, - x_tensor.data, - w_tensor.data, - None, + def lib_rms_norm(): + check_error( + lib.infiniopRMSNorm( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + y_tensor.data, + x_tensor.data, + w_tensor.data, + None, + ) ) - ) - assert torch.allclose(y.to(dtype), ans.to(dtype), atol=1e-3, rtol=1e-3) + lib_rms_norm() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y, ans, atol=atol, rtol=rtol) + assert torch.allclose(y, ans, atol=atol, rtol=rtol) + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: rms_norm(x, w, eps), torch_device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_rms_norm(), torch_device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on check_error(lib.infiniopDestroyRMSNormDescriptor(descriptor)) -def test_cpu(lib, test_cases): - device = DeviceEnum.DEVICE_CPU - handle = create_handle(lib, device) - for y_shape, x_shape, w_shape, dtype, w_dtype in test_cases: - test(lib, handle, "cpu", y_shape, x_shape, w_shape, dtype, w_dtype) - destroy_handle(lib, handle) - - -def test_cuda(lib, test_cases): - device = DeviceEnum.DEVICE_CUDA - handle = create_handle(lib, device) - for y_shape, x_shape, w_shape, dtype, w_dtype in test_cases: - test(lib, handle, "cuda", y_shape, x_shape, w_shape, dtype, w_dtype) - destroy_handle(lib, handle) - - -def test_bang(lib, test_cases): - import torch_mlu - - device = DeviceEnum.DEVICE_BANG - handle = create_handle(lib, device) - for y_shape, x_shape, w_shape, dtype, w_dtype in test_cases: - test(lib, handle, "mlu", y_shape, x_shape, w_shape, dtype, w_dtype) - destroy_handle(lib, handle) - - -def test_ascend(lib, test_cases): - import torch_npu - - device = DeviceEnum.DEVICE_ASCEND - handle = create_handle(lib, device) - for y_shape, x_shape, w_shape, 
dtype, w_dtype in test_cases: - test(lib, handle, "npu", y_shape, x_shape, w_shape, dtype, w_dtype) - - destroy_handle(lib, handle) if __name__ == "__main__": - test_cases = [ - # y_shape, x_shape, w_shape, dtype, w_dtype - ((16, 2048), (16, 2048), (2048,), torch.float16, torch.float16), - ((16, 2048), (16, 2048), (2048,), torch.float16, torch.float32), - ] + args = get_args() lib = open_lib() lib.infiniopCreateRMSNormDescriptor.restype = c_int32 @@ -178,14 +171,16 @@ def test_ascend(lib, test_cases): infiniopRMSNormDescriptor_t, ] - if args.cpu: - test_cpu(lib, test_cases) - if args.cuda: - test_cuda(lib, test_cases) - if args.bang: - test_bang(lib, test_cases) - if args.ascend: - test_ascend(lib, test_cases) - if not (args.cpu or args.cuda or args.bang or args.ascend): - test_cpu(lib, test_cases) + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + # Execute tests + for device in get_test_devices(args): + test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) + print("\033[92mTest passed!\033[0m") + + diff --git a/test/infiniop/rotary_embedding.py b/test/infiniop/rotary_embedding.py index 9e9a29866..88fde17d0 100644 --- a/test/infiniop/rotary_embedding.py +++ b/test/infiniop/rotary_embedding.py @@ -1,9 +1,6 @@ +import torch import ctypes -from ctypes import POINTER, c_void_p, c_int32, c_uint64, Structure, byref -import sys -import os - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float from libinfiniop import ( infiniopHandle_t, infiniopTensorDescriptor_t, @@ -16,10 +13,33 @@ test_operator, get_args, debug, + get_tolerance, profile_operation, - InfiniDtype, ) -import torch + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES = [ + # (t_shape, t_strides) + ((1, 32, 128), None), + ((1, 32, 64), None), + # 昇腾暂不满足这个用例,最后一维度 <=32 会有问题,可能与其核心 + # 接口 GatherMask 的内部实现相关,目前 48 64 128 都可以支持 + ((4, 1, 32), None), + ((1, 32, 128), None), + ((3, 32, 128), (8000, 200, 1)), +] + +# Data types used for testing +_TENSOR_DTYPES = [torch.float16, torch.float32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + torch.float16: {"atol": 0, "rtol": 1e-2}, + torch.float32: {"atol": 0, "rtol": 1e-3}, +} DEBUG = False PROFILE = False @@ -27,6 +47,7 @@ NUM_ITERATIONS = 1000 + class RoPEDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -75,13 +96,22 @@ def sin_cos_table(max_seq_len, dim, torch_device, theta): return torch.sin(angles), torch.cos(angles) -def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16): +def test( + lib, + handle, + torch_device, + shape, + strides=None, + dtype=torch.float16 +): print( f"Testing Rotary Positional Embedding on {torch_device} with shape:{shape} strides:{strides} and dtype:{dtype}" ) t = torch.rand(shape, dtype=dtype) - t = rearrange_if_needed(t, strides).to(torch_device) + + t = rearrange_if_needed(t, strides) + posTmp = torch.arange(0, t.shape[0]).to(torch_device) pos = torch.zeros(2 * posTmp.shape[0], dtype=torch.int32) for i in range(posTmp.shape[0]): @@ -95,11 +125,12 @@ def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16): descriptor = 
infiniopRoPEDescriptor_t() # 2x table length for test sin_table, cos_table = sin_cos_table(t.shape[0] * 2, t.shape[2], t.device, theta) - t_tensor = to_tensor(t, lib) + + t_tensor, sin_table_tensor, cos_table_tensor = [to_tensor(tensor, lib) for tensor in [t, sin_table, cos_table]] + pos_tensor = to_tensor(pos[: t.shape[0]], lib) pos_tensor.descriptor.contents.dtype = InfiniDtype.U64 - sin_table_tensor = to_tensor(sin_table, lib) - cos_table_tensor = to_tensor(cos_table, lib) + if torch_device == "npu": torch.npu.synchronize() @@ -116,10 +147,8 @@ def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16): ) # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - t_tensor.descriptor.contents.invalidate() - pos_tensor.descriptor.contents.invalidate() - sin_table_tensor.descriptor.contents.invalidate() - cos_table_tensor.descriptor.contents.invalidate() + for tensor in [t_tensor, pos_tensor, sin_table_tensor, cos_table_tensor]: + tensor.descriptor.contents.invalidate() workspace_size = c_uint64(0) check_error( @@ -142,9 +171,11 @@ def lib_rope(): ) lib_rope() + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) if DEBUG: - debug(t, ans, atol=1e-4, rtol=1e-2) - assert torch.allclose(t, ans, atol=1e-4, rtol=1e-2) + debug(t, ans, atol=atol, rtol=rtol) + assert torch.allclose(t, ans, atol=atol, rtol=rtol) + if PROFILE: profile_operation( "PyTorch", @@ -161,17 +192,6 @@ def lib_rope(): if __name__ == "__main__": - test_cases = [ - # (t_shape, t_strides) - ((1, 32, 128), None), - ((1, 32, 64), None), - # 昇腾暂不满足这个用例,最后一维度 <=32 会有问题,可能与其核心 - # 接口 GatherMask 的内部实现相关,目前 48 64 128 都可以支持 - ((4, 1, 32), None), - ((1, 32, 128), None), - ((3, 32, 128), (8000, 200, 1)), - ] - test_dtypes = [torch.float16] args = get_args() lib = open_lib() lib.infiniopCreateRoPEDescriptor.restype = c_int32 @@ -211,5 +231,5 @@ def lib_rope(): # Execute tests for device in get_test_devices(args): - test_operator(lib, device, test, test_cases, test_dtypes) + test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/swiglu.py b/test/infiniop/swiglu.py index 67b3c2b85..dd5608ab5 100644 --- a/test/infiniop/swiglu.py +++ b/test/infiniop/swiglu.py @@ -1,25 +1,50 @@ -from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p +import torch import ctypes -import sys -import os - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) -from operatorspy import ( - open_lib, - to_tensor, - CTensor, - DeviceEnum, +from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float +from libinfiniop import ( infiniopHandle_t, infiniopTensorDescriptor_t, - create_handle, - destroy_handle, + open_lib, + to_tensor, + get_test_devices, check_error, - rearrange_tensor, + rearrange_if_needed, + create_workspace, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, ) -from operatorspy.tests.test_utils import get_args -import torch - +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((16, 5632), None, None, 
None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] +# Data types used for testing +_TENSOR_DTYPES = [torch.float16, torch.float32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + torch.float16: {'atol': 0, 'rtol': 1e-2}, + torch.float32: {'atol': 0, 'rtol': 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 class SwiGLUDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -51,20 +76,18 @@ def test_out_of_place( b = torch.rand(shape, dtype=dtype).to(torch_device) c = torch.rand(shape, dtype=dtype).to(torch_device) - if a_stride is not None: - a = rearrange_tensor(a, a_stride) - if b_stride is not None: - b = rearrange_tensor(b, b_stride) - if c_stride is not None: - c = rearrange_tensor(c, c_stride) ans = swiglu(a, b) + a, b, c = [ + rearrange_if_needed(tensor, stride) + for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_stride]) + ] + a_tensor, b_tensor, c_tensor = [to_tensor(tensor, lib) for tensor in [a, b, c]] + + if sync is not None: sync() - a_tensor = to_tensor(a, lib) - b_tensor = to_tensor(b, lib) - c_tensor = to_tensor(c, lib) descriptor = infiniopSwiGLUDescriptor_t() check_error( lib.infiniopCreateSwiGLUDescriptor( @@ -77,19 +100,33 @@ def test_out_of_place( ) # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - a_tensor.descriptor.contents.invalidate() - b_tensor.descriptor.contents.invalidate() - c_tensor.descriptor.contents.invalidate() - - check_error( - lib.infiniopSwiGLU( - descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + for tensor in [a_tensor, b_tensor, c_tensor]: + tensor.descriptor.contents.invalidate() + + def lib_swiglu(): + check_error( + lib.infiniopSwiGLU( + descriptor, + c_tensor.data, + a_tensor.data, + b_tensor.data, + None + ) ) - ) + lib_swiglu() - assert torch.allclose(c, ans, atol=1e-4, rtol=1e-2) + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c, ans, atol=atol, rtol=rtol) + assert torch.allclose(c, ans, atol=atol, rtol=rtol) print("out-of-place Test passed!") + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: swiglu(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_swiglu(), torch_device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor)) @@ -106,18 +143,19 @@ def test_in_place1( a = torch.rand(shape, dtype=dtype).to(torch_device) b = torch.rand(shape, dtype=dtype).to(torch_device) - if a_stride is not None: - a = rearrange_tensor(a, a_stride) - if b_stride is not None: - b = rearrange_tensor(b, b_stride) ans = swiglu(a, b) if sync is not None: sync() - a_tensor = to_tensor(a, lib) - b_tensor = to_tensor(b, lib) + a, b = [ + rearrange_if_needed(tensor, stride) + for tensor, stride in zip([a, b], [a_stride, b_stride]) + ] + a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]] + descriptor = infiniopSwiGLUDescriptor_t() + check_error( lib.infiniopCreateSwiGLUDescriptor( handle, @@ -129,18 +167,27 @@ def test_in_place1( ) # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - a_tensor.descriptor.contents.invalidate() - b_tensor.descriptor.contents.invalidate() - - check_error( - lib.infiniopSwiGLU( - descriptor, a_tensor.data, a_tensor.data, b_tensor.data, None + for tensor in [a_tensor, 
b_tensor]: + tensor.descriptor.contents.invalidate() + def lib_swiglu(): + check_error( + lib.infiniopSwiGLU( + descriptor, a_tensor.data, a_tensor.data, b_tensor.data, None + ) ) - ) + lib_swiglu() - assert torch.allclose(a, ans, atol=1e-4, rtol=1e-2) + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(a, ans, atol=atol, rtol=rtol) + assert torch.allclose(a, ans, atol=atol, rtol=rtol) print("in-place1 Test passed!") - + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: swiglu(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_swiglu(), torch_device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor)) @@ -157,17 +204,17 @@ def test_in_place2( a = torch.rand(shape, dtype=dtype).to(torch_device) b = torch.rand(shape, dtype=dtype).to(torch_device) - if a_stride is not None: - a = rearrange_tensor(a, a_stride) - if b_stride is not None: - b = rearrange_tensor(b, b_stride) ans = swiglu(a, b) if sync is not None: sync() - a_tensor = to_tensor(a, lib) - b_tensor = to_tensor(b, lib) + a, b = [ + rearrange_if_needed(tensor, stride) + for tensor, stride in zip([a, b], [a_stride, b_stride]) + ] + a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]] + descriptor = infiniopSwiGLUDescriptor_t() check_error( lib.infiniopCreateSwiGLUDescriptor( @@ -180,100 +227,42 @@ def test_in_place2( ) # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - a_tensor.descriptor.contents.invalidate() - b_tensor.descriptor.contents.invalidate() - - check_error( - lib.infiniopSwiGLU( - descriptor, b_tensor.data, a_tensor.data, b_tensor.data, None + for tensor in [a_tensor, b_tensor]: + tensor.descriptor.contents.invalidate() + + def lib_swiglu(): + check_error( + lib.infiniopSwiGLU( + descriptor, b_tensor.data, a_tensor.data, b_tensor.data, None + ) ) - ) - - assert torch.allclose(b, ans, atol=1e-4, rtol=1e-2) - + lib_swiglu() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(b, ans, atol=atol, rtol=rtol) + assert torch.allclose(b, ans, atol=atol, rtol=rtol) + print("in-place2 Test passed!") + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: swiglu(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_swiglu(), torch_device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor)) -def test_cpu(lib, test_cases): - device = DeviceEnum.DEVICE_CPU - handle = create_handle(lib, device) - - for shape, a_stride, b_stride, c_stride, dtype in test_cases: - test_out_of_place( - lib, handle, "cpu", shape, a_stride, b_stride, c_stride, dtype - ) - test_in_place1(lib, handle, "cpu", shape, a_stride, b_stride, dtype) - test_in_place2(lib, handle, "cpu", shape, a_stride, b_stride, dtype) - - destroy_handle(lib, handle) - - -def test_cuda(lib, test_cases): - device = DeviceEnum.DEVICE_CUDA - handle = create_handle(lib, device) - - for shape, a_stride, b_stride, c_stride, dtype in test_cases: - test_out_of_place( - lib, handle, "cuda", shape, a_stride, b_stride, c_stride, dtype - ) - test_in_place1(lib, handle, "cuda", shape, a_stride, b_stride, dtype) - test_in_place2(lib, handle, "cuda", shape, a_stride, b_stride, dtype) - - destroy_handle(lib, handle) - - -def test_bang(lib, test_cases): - import torch_mlu - - device = DeviceEnum.DEVICE_BANG - handle = create_handle(lib, device) - - for 
shape, a_stride, b_stride, c_stride, dtype in test_cases: - test_out_of_place( - lib, handle, "mlu", shape, a_stride, b_stride, c_stride, dtype - ) - test_in_place1(lib, handle, "mlu", shape, a_stride, b_stride, dtype) - test_in_place2(lib, handle, "mlu", shape, a_stride, b_stride, dtype) - - destroy_handle(lib, handle) - - -def test_ascend(lib, test_cases): - import torch_npu - - device = DeviceEnum.DEVICE_ASCEND - handle = create_handle(lib, device) - - for shape, a_stride, b_stride, c_stride, dtype in test_cases: - test_out_of_place( - lib, - handle, - "npu", - shape, - a_stride, - b_stride, - c_stride, - dtype, - torch.npu.synchronize, - ) - test_in_place1( - lib, handle, "npu", shape, a_stride, b_stride, dtype, torch.npu.synchronize - ) - test_in_place2( - lib, handle, "npu", shape, a_stride, b_stride, dtype, torch.npu.synchronize - ) +def test(lib, handle, torch_device, shape, a_stride, b_stride, c_stride, dtype, sync = None): + test_out_of_place( + lib, handle, torch_device, shape, a_stride, b_stride, c_stride, dtype, sync + ) + test_in_place1(lib, handle, torch_device, shape, a_stride, b_stride, dtype, sync) + test_in_place2(lib, handle, torch_device, shape, a_stride, b_stride, dtype, sync) - destroy_handle(lib, handle) if __name__ == "__main__": - test_cases = [ - # shape, a_stride, b_stride, c_stride, dtype - ((13, 4), None, None, None, torch.float16), - ((13, 4), (10, 1), (10, 1), (10, 1), torch.float16), - ((16, 5632), None, None, None, torch.float16), - ((16, 5632), (13312, 1), (13312, 1), (13312, 1), torch.float16), - ] + args = get_args() lib = open_lib() @@ -299,13 +288,13 @@ def test_ascend(lib, test_cases): lib.infiniopDestroySwiGLUDescriptor.argtypes = [ infiniopSwiGLUDescriptor_t, ] + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) - if args.cpu: - test_cpu(lib, test_cases) - if args.cuda: - test_cuda(lib, test_cases) - if args.bang: - test_bang(lib, test_cases) - if args.ascend: - test_ascend(lib, test_cases) print("\033[92mTest passed!\033[0m") From 04aa18f655f1ef1f259e1a9566f7bd37d042fecd Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Mon, 24 Feb 2025 14:03:30 +0800 Subject: [PATCH 2/5] issue/66: modified format --- test/infiniop/causal_softmax.py | 57 ++++---- test/infiniop/random_sample.py | 64 ++++++--- test/infiniop/rearrange.py | 39 +++-- test/infiniop/rms_norm.py | 63 +++++---- test/infiniop/rotary_embedding.py | 47 +++--- test/infiniop/swiglu.py | 228 ++++++++++-------------------- 6 files changed, 213 insertions(+), 285 deletions(-) diff --git a/test/infiniop/causal_softmax.py b/test/infiniop/causal_softmax.py index 64ba65acc..9f6385d9f 100644 --- a/test/infiniop/causal_softmax.py +++ b/test/infiniop/causal_softmax.py @@ -23,22 +23,20 @@ # These are not meant to be imported from other modules _TEST_CASES = [ - # x_shape, x_stride - ((32, 512), None), - ((32, 512), (1024, 1)), - ((32, 5, 5), None), - ((32, 20, 512), None), - ((32, 20, 512), (20480, 512, 1)), # Ascend 暂不支持非连续 - ((32, 20, 4, 512), None), - ((32, 20, 4, 512), (81920, 2048, 512, 1)), - ] + # x_shape, x_stride + ((32, 512), None), + ((32, 512), (1024, 1)), + ((32, 5, 5), None), + ((32, 20, 512), None), + ((32, 20, 512), (20480, 512, 1)), # Ascend 暂不支持非连续 +] + # Data types used for testing -_TENSOR_DTYPES = [torch.float16, torch.float32] +_TENSOR_DTYPES = [torch.float16] # Tolerance map for different 
data types _TOLERANCE_MAP = { - torch.float16: {'atol': 0, 'rtol': 1e-2}, - torch.float32: {'atol': 0, 'rtol': 1e-3}, + torch.float16: {"atol": 0, "rtol": 1e-2}, } DEBUG = False @@ -46,6 +44,7 @@ NUM_PRERUN = 10 NUM_ITERATIONS = 1000 + class CausalSoftmaxDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -61,39 +60,29 @@ def causal_softmax(x): return torch.nn.functional.softmax(masked, dim=-1).to(type) -def test( - lib, - handle, - torch_device, - x_shape, - x_stride=None, - dtype=torch.float16 -): +def test(lib, handle, torch_device, x_shape, x_stride=None, dtype=torch.float16): print( f"Testing CausalSoftmax on {torch_device} with x_shape:{x_shape} x_stride:{x_stride} dtype:{dtype}" ) + x = torch.rand(x_shape, dtype=dtype).to(torch_device) ans = causal_softmax(x) - x = rearrange_if_needed(x, x_stride) - + x_tensor = to_tensor(x, lib) descriptor = infiniopCausalSoftmaxDescriptor_t() check_error( lib.infiniopCreateCausalSoftmaxDescriptor( - handle, - ctypes.byref(descriptor), - x_tensor.descriptor + handle, ctypes.byref(descriptor), x_tensor.descriptor ) ) # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel x_tensor.descriptor.contents.invalidate() - workspace_size = c_uint64(0) check_error( lib.infiniopGetCausalSoftmaxWorkspaceSize( @@ -101,6 +90,7 @@ def test( ) ) workspace = create_workspace(workspace_size.value, x.device) + def lib_causal_softmax(): check_error( lib.infiniopCausalSoftmax( @@ -111,8 +101,9 @@ def lib_causal_softmax(): None, ) ) + lib_causal_softmax() - + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) if DEBUG: debug(x, ans, atol=atol, rtol=rtol) @@ -128,24 +119,23 @@ def lib_causal_softmax(): check_error(lib.infiniopDestroyCausalSoftmaxDescriptor(descriptor)) - - - if __name__ == "__main__": - args = get_args() lib = open_lib() + lib.infiniopCreateCausalSoftmaxDescriptor.restype = c_int32 lib.infiniopCreateCausalSoftmaxDescriptor.argtypes = [ infiniopHandle_t, POINTER(infiniopCausalSoftmaxDescriptor_t), infiniopTensorDescriptor_t, ] + lib.infiniopGetCausalSoftmaxWorkspaceSize.restype = c_int32 lib.infiniopGetCausalSoftmaxWorkspaceSize.argtypes = [ infiniopCausalSoftmaxDescriptor_t, POINTER(c_uint64), ] + lib.infiniopCausalSoftmax.restype = c_int32 lib.infiniopCausalSoftmax.argtypes = [ infiniopCausalSoftmaxDescriptor_t, @@ -154,18 +144,19 @@ def lib_causal_softmax(): c_void_p, c_void_p, ] + lib.infiniopDestroyCausalSoftmaxDescriptor.restype = c_int32 lib.infiniopDestroyCausalSoftmaxDescriptor.argtypes = [ infiniopCausalSoftmaxDescriptor_t, ] + # Configure testing options DEBUG = args.debug PROFILE = args.profile NUM_PRERUN = args.num_prerun NUM_ITERATIONS = args.num_iterations - + for device in get_test_devices(args): test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) print("\033[92mTest passed!\033[0m") - diff --git a/test/infiniop/random_sample.py b/test/infiniop/random_sample.py index c2f4f0e5a..c5741e245 100644 --- a/test/infiniop/random_sample.py +++ b/test/infiniop/random_sample.py @@ -12,33 +12,40 @@ create_workspace, test_operator, get_args, - debug, + debug_all, get_tolerance, profile_operation, + synchronize_device, ) # ============================================================================== # Configuration (Internal Use Only) # ============================================================================== # These are not meant to be imported from other modules + _TEST_CASES = [ # voc, random_val, topp, topk, temperature - (512, 0.8, 0.8, 3, 0.5), - (4096, 0.05, 0.9, 5, 1.0), - 
(16384, 0.15, 0.85, 10, 2.0), - (512, 0.08, 0, 3, 0.5), - (4096, 0.5, 0.9, 1, 1.0), - (16384, 0.15, 0, 1, 2.0), - (16384, 0.15, 0, 1, 2.0), - (32000, 0.08, 0.8, 50, 1.0), - (32000, 0.08, 1.0, 25, 1.0), - # (119696, 0.01, 1.0, 100, 1.0), + (512, 0.8, 0.8, 3, 0.5), + (4096, 0.05, 0.9, 5, 1.0), + (16384, 0.15, 0.85, 10, 2.0), + (512, 0.08, 0, 3, 0.5), + (4096, 0.5, 0.9, 1, 1.0), + (16384, 0.15, 0, 1, 2.0), + (16384, 0.15, 0, 1, 2.0), + (32000, 0.08, 0.8, 50, 1.0), + (32000, 0.08, 1.0, 25, 1.0), + # (119696, 0.01, 1.0, 100, 1.0), ] # Data types used for testing -_TENSOR_DTYPES = [torch.float16, torch.float32] +_TENSOR_DTYPES = [torch.float16] + +_TOLERANCE_MAP = { + torch.float16: {"atol": 0, "rtol": 0}, +} +DEBUG = False PROFILE = False NUM_PRERUN = 10 NUM_ITERATIONS = 1000 @@ -113,6 +120,7 @@ def test( x_dtype=torch.float16, ): print(f"Testing RandomSample on {torch_device} with voc:{voc} dtype:{x_dtype}") + data = torch.arange(voc).float() * 0.0001 _perm = torch.randperm(voc) data = data[_perm].to(x_dtype).to(torch_device) @@ -122,9 +130,11 @@ def test( ) else: ans = random_sample_0(data) + indices = torch.zeros([1], dtype=torch.int64).to(torch_device) - x_tensor = to_tensor(data, lib) - indices_tensor = to_tensor(indices, lib) + + x_tensor, indices_tensor = [to_tensor(tensor, lib) for tensor in [data, indices]] + indices_tensor.descriptor.contents.dt = U64 # treat int64 as uint64 descriptor = infiniopRandomSampleDescriptor_t() @@ -148,7 +158,7 @@ def test( ) ) workspace = create_workspace(workspace_size.value, torch_device) - + def lib_random_sample(): check_error( lib.infiniopRandomSample( @@ -164,11 +174,21 @@ def lib_random_sample(): None, ) ) - if torch_device == "npu": - torch.npu.synchronize() + if torch_device == "npu": + synchronize_device(torch_device) + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug_all( + (indices[0].type(ans.dtype), data[indices[0]]), + (ans, data[ans]), + "or", + atol=atol, + rtol=rtol, + ) assert indices[0].type(ans.dtype) == ans or data[ans] == data[indices[0]] - + # Profiling workflow if PROFILE: # fmt: off @@ -184,23 +204,23 @@ def lib_random_sample(): check_error(lib.infiniopDestroyRandomSampleDescriptor(descriptor)) - - if __name__ == "__main__": - args = get_args() lib = open_lib() + lib.infiniopCreateRandomSampleDescriptor.restype = c_int32 lib.infiniopCreateRandomSampleDescriptor.argtypes = [ infiniopHandle_t, POINTER(infiniopRandomSampleDescriptor_t), infiniopTensorDescriptor_t, ] + lib.infiniopGetRandomSampleWorkspaceSize.restype = c_int32 lib.infiniopGetRandomSampleWorkspaceSize.argtypes = [ infiniopRandomSampleDescriptor_t, POINTER(c_uint64), ] + lib.infiniopRandomSample.restype = c_int32 lib.infiniopRandomSample.argtypes = [ infiniopRandomSampleDescriptor_t, @@ -214,11 +234,13 @@ def lib_random_sample(): c_float, c_void_p, ] + lib.infiniopDestroyRandomSampleDescriptor.restype = c_int32 lib.infiniopDestroyRandomSampleDescriptor.argtypes = [ infiniopRandomSampleDescriptor_t, ] + DEBUG = args.debug PROFILE = args.profile NUM_PRERUN = args.num_prerun NUM_ITERATIONS = args.num_iterations diff --git a/test/infiniop/rearrange.py b/test/infiniop/rearrange.py index 955ee1719..0e54e9f5f 100644 --- a/test/infiniop/rearrange.py +++ b/test/infiniop/rearrange.py @@ -23,13 +23,13 @@ # These are not meant to be imported from other modules _TEST_CASES = [ # ((src_shape, src_stride), (dst_shape, dst_stride)) - (((2, 4, 32), None), ((2, 4, 32), (256, 64, 1))), - (((32, 6, 64), (64, 2560, 1)), ((32, 6, 64), None)), - (((4, 6, 64), (64, 2560, 
1)), ((4, 6, 64), (131072, 64, 1))), - (((1, 32, 64), (2048, 64, 1)), ((1, 32, 64), (2048, 64, 1))), - (((32, 1, 64), (64, 2560, 1)), ((32, 1, 64), (64, 64, 1))), - (((4, 1, 64), (64, 2560, 1)), ((4, 1, 64), (64, 11264, 1))), - (((64,), (1,)), ((64,), (1,))), + (((2, 4, 32), None), ((2, 4, 32), (256, 64, 1))), + (((32, 6, 64), (64, 2560, 1)), ((32, 6, 64), None)), + (((4, 6, 64), (64, 2560, 1)), ((4, 6, 64), (131072, 64, 1))), + (((1, 32, 64), (2048, 64, 1)), ((1, 32, 64), (2048, 64, 1))), + (((32, 1, 64), (64, 2560, 1)), ((32, 1, 64), (64, 64, 1))), + (((4, 1, 64), (64, 2560, 1)), ((4, 1, 64), (64, 11264, 1))), + (((64,), (1,)), ((64,), (1,))), ] # Data types used for testing @@ -37,8 +37,8 @@ # Tolerance map for different data types _TOLERANCE_MAP = { - torch.float16: {"atol": 0, "rtol": 1e-3}, - torch.float32: {"atol": 0, "rtol": 1e-3}, + torch.float16: {"atol": 0, "rtol": 0}, + torch.float32: {"atol": 0, "rtol": 0}, } DEBUG = False @@ -47,7 +47,6 @@ NUM_ITERATIONS = 1000 - class RerrangeDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -68,16 +67,16 @@ def test( print( f"Testing Rerrange on {torch_device} with x_shape:{x_shape} x_stride:{x_stride} y_shape:{y_shape} y_stride:{y_stride} x_dtype:{x_dtype}" ) + x = torch.rand(x_shape, dtype=x_dtype).to(torch_device) y = torch.zeros(y_shape, dtype=x_dtype).to(torch_device) - + x, y = [ rearrange_if_needed(tensor, stride) for tensor, stride in zip([x, y], [x_stride, y_stride]) ] x_tensor, y_tensor = [to_tensor(tensor, lib) for tensor in [x, y]] - descriptor = infiniopRearrangeDescriptor_t() check_error( lib.infiniopCreateRearrangeDescriptor( @@ -91,15 +90,11 @@ def test( def lib_rearrange(): check_error( - lib.infiniopRearrange( - descriptor, - y_tensor.data, - x_tensor.data, - None - ) + lib.infiniopRearrange(descriptor, y_tensor.data, x_tensor.data, None) ) + lib_rearrange() - + # Validate results atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) if DEBUG: @@ -116,8 +111,6 @@ def lib_rearrange(): check_error(lib.infiniopDestroyRearrangeDescriptor(descriptor)) - - if __name__ == "__main__": args = get_args() lib = open_lib() @@ -129,6 +122,7 @@ def lib_rearrange(): infiniopTensorDescriptor_t, infiniopTensorDescriptor_t, ] + lib.infiniopRearrange.restype = c_int32 lib.infiniopRearrange.argtypes = [ infiniopRearrangeDescriptor_t, @@ -136,9 +130,10 @@ def lib_rearrange(): c_void_p, c_void_p, ] + lib.infiniopDestroyRearrangeDescriptor.restype = c_int32 lib.infiniopDestroyRearrangeDescriptor.argtypes = [infiniopRearrangeDescriptor_t] - + # Configure testing options DEBUG = args.debug PROFILE = args.profile diff --git a/test/infiniop/rms_norm.py b/test/infiniop/rms_norm.py index 0adf37241..b60c44922 100644 --- a/test/infiniop/rms_norm.py +++ b/test/infiniop/rms_norm.py @@ -23,21 +23,20 @@ # Configuration (Internal Use Only) # ============================================================================== # These are not meant to be imported from other modules - _TEST_CASES = [ # y_shape, x_shape, w_shape, y_stride, x_stride, w_dtype - ((16, 2048), (16, 2048), (2048,), None, None,torch.float32), + ((16, 2048), (16, 2048), (2048,), None, None, torch.float32), ((16, 2048), (16, 2048), (2048,), None, None, torch.float16), - ((16, 2048), (16, 2048), (2048,), (4096, 1), (4096, 1),torch.float32), + ((16, 2048), (16, 2048), (2048,), (4096, 1), (4096, 1), torch.float32), ((16, 2048), (16, 2048), (2048,), (4096, 1), (4096, 1), torch.float16), ] + # x types used for testing -_TENSOR_DTYPES = [torch.float16, torch.float32] +_TENSOR_DTYPES = 
[torch.float16] # Tolerance map for different data types _TOLERANCE_MAP = { - torch.float16: {"atol": 0, "rtol": 1e-2}, - torch.float32: {"atol": 0, "rtol": 1e-3}, + torch.float16: {"atol": 1e-3, "rtol": 1e-3}, } DEBUG = False @@ -45,12 +44,14 @@ NUM_PRERUN = 10 NUM_ITERATIONS = 1000 + class RMSNormDescriptor(Structure): _fields_ = [("device", c_int32)] infiniopRMSNormDescriptor_t = POINTER(RMSNormDescriptor) + def rms_norm(x, w, eps): input_dtype = x.dtype hidden_states = x.to(torch.float32) @@ -60,18 +61,21 @@ def rms_norm(x, w, eps): def test( - lib, - handle, - torch_device, - y_shape, - x_shape, - w_shape, + lib, + handle, + torch_device, + y_shape, + x_shape, + w_shape, y_stride, x_stride, - dtype=torch.float16, - w_dtype=torch.float16): - print(f"Testing RMS_Norm on {torch_device} with y_shape:{y_shape} x_shape:{x_shape} w_shape:{w_shape}" - f" dtype:{dtype} w_dtype:{w_dtype}") + dtype=torch.float16, + w_dtype=torch.float16, +): + print( + f"Testing RMS_Norm on {torch_device} with y_shape:{y_shape} x_shape:{x_shape} w_shape:{w_shape}" + f" dtype:{dtype} w_dtype:{w_dtype}" + ) y = torch.zeros(y_shape, dtype=dtype).to(torch_device) x = torch.rand(x_shape, dtype=dtype).to(torch_device) @@ -80,18 +84,23 @@ def test( eps = 1e-5 ans = rms_norm(x, w, eps) - x = rearrange_if_needed(x, x_stride) - y = rearrange_if_needed(y, y_stride) + x, y = [ + rearrange_if_needed(tensor, stride) + for tensor, stride in zip([x, y], [x_stride, y_stride]) + ] x_tensor, y_tensor, w_tensor = [to_tensor(tensor, lib) for tensor in [x, y, w]] descriptor = infiniopRMSNormDescriptor_t() - w_dataType = 0 if w_dtype==torch.float16 else 1 check_error( lib.infiniopCreateRMSNormDescriptor( - handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor, - w_tensor.descriptor, eps + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + w_tensor.descriptor, + eps, ) ) @@ -101,11 +110,10 @@ def test( workspace_size = c_uint64(0) check_error( - lib.infiniopGetRMSNormWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) + lib.infiniopGetRMSNormWorkspaceSize(descriptor, ctypes.byref(workspace_size)) ) workspace = create_workspace(workspace_size.value, y.device) + def lib_rms_norm(): check_error( lib.infiniopRMSNorm( @@ -134,12 +142,10 @@ def lib_rms_norm(): check_error(lib.infiniopDestroyRMSNormDescriptor(descriptor)) - - if __name__ == "__main__": - args = get_args() lib = open_lib() + lib.infiniopCreateRMSNormDescriptor.restype = c_int32 lib.infiniopCreateRMSNormDescriptor.argtypes = [ infiniopHandle_t, @@ -166,6 +172,7 @@ def lib_rms_norm(): c_void_p, c_void_p, ] + lib.infiniopDestroyRMSNormDescriptor.restype = c_int32 lib.infiniopDestroyRMSNormDescriptor.argtypes = [ infiniopRMSNormDescriptor_t, @@ -182,5 +189,3 @@ def lib_rms_norm(): test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) print("\033[92mTest passed!\033[0m") - - diff --git a/test/infiniop/rotary_embedding.py b/test/infiniop/rotary_embedding.py index 88fde17d0..64237c67f 100644 --- a/test/infiniop/rotary_embedding.py +++ b/test/infiniop/rotary_embedding.py @@ -15,6 +15,7 @@ debug, get_tolerance, profile_operation, + synchronize_device, ) # ============================================================================== @@ -23,22 +24,21 @@ # These are not meant to be imported from other modules _TEST_CASES = [ # (t_shape, t_strides) - ((1, 32, 128), None), - ((1, 32, 64), None), - # 昇腾暂不满足这个用例,最后一维度 <=32 会有问题,可能与其核心 - # 接口 GatherMask 的内部实现相关,目前 48 64 128 都可以支持 - ((4, 1, 32), None), - ((1, 32, 128), 
None), - ((3, 32, 128), (8000, 200, 1)), + ((1, 32, 128), None), + ((1, 32, 64), None), + # 昇腾暂不满足这个用例,最后一维度 <=32 会有问题,可能与其核心 + # 接口 GatherMask 的内部实现相关,目前 48 64 128 都可以支持 + ((4, 1, 32), None), + ((1, 32, 128), None), + ((3, 32, 128), (8000, 200, 1)), ] # Data types used for testing -_TENSOR_DTYPES = [torch.float16, torch.float32] +_TENSOR_DTYPES = [torch.float16] # Tolerance map for different data types _TOLERANCE_MAP = { - torch.float16: {"atol": 0, "rtol": 1e-2}, - torch.float32: {"atol": 0, "rtol": 1e-3}, + torch.float16: {"atol": 1e-4, "rtol": 1e-2}, } DEBUG = False @@ -47,7 +47,6 @@ NUM_ITERATIONS = 1000 - class RoPEDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -96,14 +95,7 @@ def sin_cos_table(max_seq_len, dim, torch_device, theta): return torch.sin(angles), torch.cos(angles) -def test( - lib, - handle, - torch_device, - shape, - strides=None, - dtype=torch.float16 -): +def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16): print( f"Testing Rotary Positional Embedding on {torch_device} with shape:{shape} strides:{strides} and dtype:{dtype}" ) @@ -126,14 +118,15 @@ def test( # 2x table length for test sin_table, cos_table = sin_cos_table(t.shape[0] * 2, t.shape[2], t.device, theta) - t_tensor, sin_table_tensor, cos_table_tensor = [to_tensor(tensor, lib) for tensor in [t, sin_table, cos_table]] - + t_tensor, sin_table_tensor, cos_table_tensor = [ + to_tensor(tensor, lib) for tensor in [t, sin_table, cos_table] + ] + pos_tensor = to_tensor(pos[: t.shape[0]], lib) pos_tensor.descriptor.contents.dtype = InfiniDtype.U64 - if torch_device == "npu": - torch.npu.synchronize() + synchronize_device(torch_device) check_error( lib.infiniopCreateRoPEDescriptor( @@ -171,11 +164,12 @@ def lib_rope(): ) lib_rope() + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) if DEBUG: debug(t, ans, atol=atol, rtol=rtol) assert torch.allclose(t, ans, atol=atol, rtol=rtol) - + if PROFILE: profile_operation( "PyTorch", @@ -194,6 +188,7 @@ def lib_rope(): if __name__ == "__main__": args = get_args() lib = open_lib() + lib.infiniopCreateRoPEDescriptor.restype = c_int32 lib.infiniopCreateRoPEDescriptor.argtypes = [ infiniopHandle_t, @@ -203,11 +198,13 @@ def lib_rope(): infiniopTensorDescriptor_t, infiniopTensorDescriptor_t, ] + lib.infiniopGetRoPEWorkspaceSize.restype = c_int32 lib.infiniopGetRoPEWorkspaceSize.argtypes = [ infiniopRoPEDescriptor_t, POINTER(c_uint64), ] + lib.infiniopRoPE.restype = c_int32 lib.infiniopRoPE.argtypes = [ infiniopRoPEDescriptor_t, @@ -219,10 +216,12 @@ def lib_rope(): c_void_p, c_void_p, ] + lib.infiniopDestroyRoPEDescriptor.restype = c_int32 lib.infiniopDestroyRoPEDescriptor.argtypes = [ infiniopRoPEDescriptor_t, ] + # Configure testing options DEBUG = args.debug PROFILE = args.profile diff --git a/test/infiniop/swiglu.py b/test/infiniop/swiglu.py index dd5608ab5..fd933f8d9 100644 --- a/test/infiniop/swiglu.py +++ b/test/infiniop/swiglu.py @@ -16,29 +16,64 @@ get_tolerance, profile_operation, ) +from enum import Enum, auto # ============================================================================== # Configuration (Internal Use Only) # ============================================================================== # These are not meant to be imported from other modules _TEST_CASES = [ - # shape, a_stride, b_stride, c_stride - ((13, 4), None, None, None), - ((13, 4), (10, 1), (10, 1), (10, 1)), - ((13, 4, 4), None, None, None), - ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), - ((16, 5632), None, None, None), - ((16, 5632), (13312, 1), (13312, 
1), (13312, 1)),
-    ((4, 4, 5632), None, None, None),
-    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
+    # shape, a_stride, b_stride, c_stride, inplace
+    ((13, 4), None, None, None, Inplace.OUT_OF_PLACE),
+    ((13, 4), None, None, None, Inplace.INPLACE_A),
+    ((13, 4), None, None, None, Inplace.INPLACE_B),
+    ((13, 4), (10, 1), (10, 1), (10, 1), Inplace.OUT_OF_PLACE),
+    ((13, 4), (10, 1), (10, 1), (10, 1), Inplace.INPLACE_A),
+    ((13, 4), (10, 1), (10, 1), (10, 1), Inplace.INPLACE_B),
+    ((13, 4, 4), None, None, None, Inplace.OUT_OF_PLACE),
+    ((13, 4, 4), None, None, None, Inplace.INPLACE_A),
+    ((13, 4, 4), None, None, None, Inplace.INPLACE_B),
+    ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1), Inplace.OUT_OF_PLACE),
+    ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1), Inplace.INPLACE_A),
+    ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1), Inplace.INPLACE_B),
+    ((16, 5632), None, None, None, Inplace.OUT_OF_PLACE),
+    ((16, 5632), None, None, None, Inplace.INPLACE_A),
+    ((16, 5632), None, None, None, Inplace.INPLACE_B),
+    ((16, 5632), (13312, 1), (13312, 1), (13312, 1), Inplace.OUT_OF_PLACE),
+    ((16, 5632), (13312, 1), (13312, 1), (13312, 1), Inplace.INPLACE_A),
+    ((16, 5632), (13312, 1), (13312, 1), (13312, 1), Inplace.INPLACE_B),
+    ((4, 4, 5632), None, None, None, Inplace.OUT_OF_PLACE),
+    ((4, 4, 5632), None, None, None, Inplace.INPLACE_A),
+    ((4, 4, 5632), None, None, None, Inplace.INPLACE_B),
+    (
+        (4, 4, 5632),
+        (45056, 5632, 1),
+        (45056, 5632, 1),
+        (45056, 5632, 1),
+        Inplace.OUT_OF_PLACE,
+    ),
+    (
+        (4, 4, 5632),
+        (45056, 5632, 1),
+        (45056, 5632, 1),
+        (45056, 5632, 1),
+        Inplace.INPLACE_A,
+    ),
+    (
+        (4, 4, 5632),
+        (45056, 5632, 1),
+        (45056, 5632, 1),
+        (45056, 5632, 1),
+        Inplace.INPLACE_B,
+    ),
 ]
+
 # Data types used for testing
-_TENSOR_DTYPES = [torch.float16, torch.float32]
+_TENSOR_DTYPES = [torch.float16]
 
 # Tolerance map for different data types
 _TOLERANCE_MAP = {
-    torch.float16: {'atol': 0, 'rtol': 1e-2},
-    torch.float32: {'atol': 0, 'rtol': 1e-3},
+    torch.float16: {"atol": 1e-4, "rtol": 1e-2},
 }
 
 DEBUG = False
@@ -46,6 +81,13 @@ NUM_PRERUN = 10
 NUM_ITERATIONS = 1000
 
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_A = auto()
+    INPLACE_B = auto()
+
+
 class SwiGLUDescriptor(Structure):
     _fields_ = [("device", c_int32)]
 
@@ -54,11 +96,10 @@ class SwiGLUDescriptor(Structure):
 
 
 def swiglu(a, b):
-
     return a * b / (1 + torch.exp(-b.float()).to(b.dtype))
 
 
-def test_out_of_place(
+def test(
     lib,
     handle,
     torch_device,
@@ -66,15 +107,21 @@ def test_out_of_place(
     a_stride=None,
     b_stride=None,
     c_stride=None,
+    inplace=Inplace.OUT_OF_PLACE,
     dtype=torch.float16,
     sync=None,
 ):
     print(
         f"Testing SwiGLU on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} dtype:{dtype}"
     )
+
     a = torch.rand(shape, dtype=dtype).to(torch_device)
     b = torch.rand(shape, dtype=dtype).to(torch_device)
-    c = torch.rand(shape, dtype=dtype).to(torch_device)
+    c = (
+        torch.rand(shape, dtype=dtype).to(torch_device)
+        if inplace == Inplace.OUT_OF_PLACE
+        else (a if inplace == Inplace.INPLACE_A else b)
+    )
 
     ans = swiglu(a, b)
 
@@ -82,9 +129,12 @@ def test_out_of_place(
         rearrange_if_needed(tensor, stride)
         for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_stride])
     ]
-    a_tensor, b_tensor, c_tensor = [to_tensor(tensor, lib) for tensor in [a, b, c]]
-
-
+    a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
+    c_tensor = (
+        to_tensor(c, lib)
+        if inplace == Inplace.OUT_OF_PLACE
+        else (a_tensor if inplace == Inplace.INPLACE_A else
b_tensor) + ) if sync is not None: sync() @@ -106,13 +156,10 @@ def test_out_of_place( def lib_swiglu(): check_error( lib.infiniopSwiGLU( - descriptor, - c_tensor.data, - a_tensor.data, - b_tensor.data, - None + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None ) ) + lib_swiglu() atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) @@ -130,139 +177,7 @@ def lib_swiglu(): check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor)) -def test_in_place1( - lib, - handle, - torch_device, - shape, - a_stride=None, - b_stride=None, - dtype=torch.float16, - sync=None, -): - a = torch.rand(shape, dtype=dtype).to(torch_device) - b = torch.rand(shape, dtype=dtype).to(torch_device) - - ans = swiglu(a, b) - - if sync is not None: - sync() - - a, b = [ - rearrange_if_needed(tensor, stride) - for tensor, stride in zip([a, b], [a_stride, b_stride]) - ] - a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]] - - descriptor = infiniopSwiGLUDescriptor_t() - - check_error( - lib.infiniopCreateSwiGLUDescriptor( - handle, - ctypes.byref(descriptor), - a_tensor.descriptor, - a_tensor.descriptor, - b_tensor.descriptor, - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [a_tensor, b_tensor]: - tensor.descriptor.contents.invalidate() - def lib_swiglu(): - check_error( - lib.infiniopSwiGLU( - descriptor, a_tensor.data, a_tensor.data, b_tensor.data, None - ) - ) - lib_swiglu() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(a, ans, atol=atol, rtol=rtol) - assert torch.allclose(a, ans, atol=atol, rtol=rtol) - print("in-place1 Test passed!") - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: swiglu(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_swiglu(), torch_device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor)) - - -def test_in_place2( - lib, - handle, - torch_device, - shape, - a_stride=None, - b_stride=None, - dtype=torch.float16, - sync=None, -): - a = torch.rand(shape, dtype=dtype).to(torch_device) - b = torch.rand(shape, dtype=dtype).to(torch_device) - - ans = swiglu(a, b) - - if sync is not None: - sync() - - a, b = [ - rearrange_if_needed(tensor, stride) - for tensor, stride in zip([a, b], [a_stride, b_stride]) - ] - a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]] - - descriptor = infiniopSwiGLUDescriptor_t() - check_error( - lib.infiniopCreateSwiGLUDescriptor( - handle, - ctypes.byref(descriptor), - b_tensor.descriptor, - a_tensor.descriptor, - b_tensor.descriptor, - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [a_tensor, b_tensor]: - tensor.descriptor.contents.invalidate() - - def lib_swiglu(): - check_error( - lib.infiniopSwiGLU( - descriptor, b_tensor.data, a_tensor.data, b_tensor.data, None - ) - ) - lib_swiglu() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(b, ans, atol=atol, rtol=rtol) - assert torch.allclose(b, ans, atol=atol, rtol=rtol) - print("in-place2 Test passed!") - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: swiglu(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_swiglu(), torch_device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor)) - - -def test(lib, handle, 
torch_device, shape, a_stride, b_stride, c_stride, dtype, sync = None): - test_out_of_place( - lib, handle, torch_device, shape, a_stride, b_stride, c_stride, dtype, sync - ) - test_in_place1(lib, handle, torch_device, shape, a_stride, b_stride, dtype, sync) - test_in_place2(lib, handle, torch_device, shape, a_stride, b_stride, dtype, sync) - - - if __name__ == "__main__": - args = get_args() lib = open_lib() @@ -288,12 +203,13 @@ def test(lib, handle, torch_device, shape, a_stride, b_stride, c_stride, dtype, lib.infiniopDestroySwiGLUDescriptor.argtypes = [ infiniopSwiGLUDescriptor_t, ] + # Configure testing options DEBUG = args.debug PROFILE = args.profile NUM_PRERUN = args.num_prerun NUM_ITERATIONS = args.num_iterations - + for device in get_test_devices(args): test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) From 08a29c28bfd926b3142e355fd6016b08ad8d4014 Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Mon, 24 Feb 2025 16:34:13 +0800 Subject: [PATCH 3/5] issue/66: modified random sample test function --- test/infiniop/causal_softmax.py | 1 - test/infiniop/random_sample.py | 102 +++++++++++++++----------------- 2 files changed, 49 insertions(+), 54 deletions(-) diff --git a/test/infiniop/causal_softmax.py b/test/infiniop/causal_softmax.py index 9f6385d9f..fd3f63957 100644 --- a/test/infiniop/causal_softmax.py +++ b/test/infiniop/causal_softmax.py @@ -21,7 +21,6 @@ # Configuration (Internal Use Only) # ============================================================================== # These are not meant to be imported from other modules - _TEST_CASES = [ # x_shape, x_stride ((32, 512), None), diff --git a/test/infiniop/random_sample.py b/test/infiniop/random_sample.py index c5741e245..fc0694184 100644 --- a/test/infiniop/random_sample.py +++ b/test/infiniop/random_sample.py @@ -22,7 +22,6 @@ # Configuration (Internal Use Only) # ============================================================================== # These are not meant to be imported from other modules - _TEST_CASES = [ # voc, random_val, topp, topk, temperature (512, 0.8, 0.8, 3, 0.5), @@ -59,53 +58,52 @@ class RandomSampleDescriptor(Structure): def random_sample(data, random_val, topp, topk, voc, temperature, torch_device): - indices = torch.zeros([topk], dtype=torch.int64) - dataNp = data.clone().detach() - sorted_indices = torch.arange(voc) - - for i in range(topk): - for j in range(i + 1, voc): - if dataNp[i] < dataNp[j]: - tmp = dataNp[i].clone().detach() - dataNp[i] = dataNp[j].clone().detach() - dataNp[j] = tmp - - tmpInd = sorted_indices[i].clone().detach() - sorted_indices[i] = sorted_indices[j].clone().detach() - sorted_indices[j] = tmpInd - - # sorted_indices = torch.argsort(dataNp, descending=True) - indices = sorted_indices[:topk] - - dataNp = dataNp[sorted_indices] - - globalM = dataNp[0] - dataNp = (dataNp - globalM) / temperature - dataNp = torch.softmax(dataNp.float(), dim=0) - sum_s = 0 - for end in range(topk): - sum_s += dataNp[end] - if sum_s >= topp: - break - if end < topk - 1: - end += 1 + if topp > 0 and topk > 1: + indices = torch.zeros([topk], dtype=torch.int64) + dataNp = data.clone().detach() + sorted_indices = torch.arange(voc) + + for i in range(topk): + for j in range(i + 1, voc): + if dataNp[i] < dataNp[j]: + tmp = dataNp[i].clone().detach() + dataNp[i] = dataNp[j].clone().detach() + dataNp[j] = tmp + + tmpInd = sorted_indices[i].clone().detach() + sorted_indices[i] = sorted_indices[j].clone().detach() + sorted_indices[j] = tmpInd + + # sorted_indices = torch.argsort(dataNp, 
descending=True) + indices = sorted_indices[:topk] + + dataNp = dataNp[sorted_indices] + + globalM = dataNp[0] + dataNp = (dataNp - globalM) / temperature + dataNp = torch.softmax(dataNp.float(), dim=0) + sum_s = 0 + for end in range(topk): + sum_s += dataNp[end] + if sum_s >= topp: + break + if end < topk - 1: + end += 1 + else: + end = topk + + sum_s = 0 + for i in range(end): + sum_s += dataNp[i] + random_val *= sum_s + + sum_s = 0 + for i in range(end): + sum_s += dataNp[i] + if random_val < sum_s: + return indices[i] else: - end = topk - - sum_s = 0 - for i in range(end): - sum_s += dataNp[i] - random_val *= sum_s - - sum_s = 0 - for i in range(end): - sum_s += dataNp[i] - if random_val < sum_s: - return indices[i] - - -def random_sample_0(data): - return torch.argmax(data) + return torch.argmax(data) def test( @@ -124,12 +122,10 @@ def test( data = torch.arange(voc).float() * 0.0001 _perm = torch.randperm(voc) data = data[_perm].to(x_dtype).to(torch_device) - if topp > 0 and topk > 1: - ans = random_sample( - data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu" - ) - else: - ans = random_sample_0(data) + + ans = random_sample( + data, random_val, topp, topk, voc, temperature, torch_device + ) # 这个函数在device速度可能会很慢,可以通过data.to("cpu")方式加快计算过程 indices = torch.zeros([1], dtype=torch.int64).to(torch_device) From c0811ed4c6b112fa08fa9567b7e22ab01c7721c9 Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Tue, 25 Feb 2025 13:28:13 +0800 Subject: [PATCH 4/5] issue/66: modified random_sample, swiglu, rms_norm, test --- test/infiniop/random_sample.py | 8 ++--- test/infiniop/rms_norm.py | 1 + test/infiniop/swiglu.py | 66 ++++++++++++---------------------- 3 files changed, 25 insertions(+), 50 deletions(-) diff --git a/test/infiniop/random_sample.py b/test/infiniop/random_sample.py index fc0694184..9584d4925 100644 --- a/test/infiniop/random_sample.py +++ b/test/infiniop/random_sample.py @@ -188,13 +188,9 @@ def lib_random_sample(): # Profiling workflow if PROFILE: # fmt: off - if topp > 0 and topk > 1: - profile_operation("PyTorch", lambda: random_sample( - data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu" + profile_operation("PyTorch", lambda: random_sample( + data, random_val, topp, topk, voc, temperature, torch_device ), torch_device, NUM_PRERUN, NUM_ITERATIONS) - else: - profile_operation("PyTorch", lambda: random_sample_0(data), torch_device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_random_sample(), torch_device, NUM_PRERUN, NUM_ITERATIONS) # fmt: on check_error(lib.infiniopDestroyRandomSampleDescriptor(descriptor)) diff --git a/test/infiniop/rms_norm.py b/test/infiniop/rms_norm.py index b60c44922..eaab61fd0 100644 --- a/test/infiniop/rms_norm.py +++ b/test/infiniop/rms_norm.py @@ -133,6 +133,7 @@ def lib_rms_norm(): if DEBUG: debug(y, ans, atol=atol, rtol=rtol) assert torch.allclose(y, ans, atol=atol, rtol=rtol) + # Profiling workflow if PROFILE: # fmt: off diff --git a/test/infiniop/swiglu.py b/test/infiniop/swiglu.py index fd933f8d9..427db4d82 100644 --- a/test/infiniop/swiglu.py +++ b/test/infiniop/swiglu.py @@ -22,50 +22,29 @@ # Configuration (Internal Use Only) # ============================================================================== # These are not meant to be imported from other modules +_TEST_CASES_ = [ + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 
1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + "Inplace.OUT_OF_PLACE", + "Inplace.INPLACE_A", + "Inplace.INPLACE_B", +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ _TEST_CASES = [ - # shape, a_stride, b_stride, c_stride, inplace - ((13, 4), None, None, None, Inplace.OUT_OF_PLACE), - ((13, 4), None, None, None, Inplace.INPLACE_A), - ((13, 4), None, None, None, Inplace.INPLACE_B), - ((13, 4), (10, 1), (10, 1), (10, 1), Inplace.OUT_OF_PLACE), - ((13, 4), (10, 1), (10, 1), (10, 1), Inplace.INPLACE_A), - ((13, 4), (10, 1), (10, 1), (10, 1), Inplace.INPLACE_B), - ((13, 4, 4), None, None, None, Inplace.OUT_OF_PLACE), - ((13, 4, 4), None, None, None, Inplace.INPLACE_A), - ((13, 4, 4), None, None, None, Inplace.INPLACE_B), - ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1), Inplace.OUT_OF_PLACE), - ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1), Inplace.INPLACE_A), - ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1), Inplace.INPLACE_B), - ((16, 5632), None, None, None, Inplace.OUT_OF_PLACE), - ((16, 5632), None, None, None, Inplace.INPLACE_A), - ((16, 5632), None, None, None, Inplace.INPLACE_B), - ((16, 5632), (13312, 1), (13312, 1), (13312, 1), Inplace.OUT_OF_PLACE), - ((16, 5632), (13312, 1), (13312, 1), (13312, 1), Inplace.INPLACE_A), - ((16, 5632), (13312, 1), (13312, 1), (13312, 1), Inplace.INPLACE_B), - ((4, 4, 5632), None, None, None, Inplace.OUT_OF_PLACE), - ((4, 4, 5632), None, None, None, Inplace.INPLACE_A), - ((4, 4, 5632), None, None, None, Inplace.INPLACE_B), - ( - (4, 4, 5632), - (45056, 5632, 1), - (45056, 5632, 1), - (45056, 5632, 1), - Inplace.OUT_OF_PLACE, - ), - ( - (4, 4, 5632), - (45056, 5632, 1), - (45056, 5632, 1), - (45056, 5632, 1), - Inplace.INPLACE_A, - ), - ( - (4, 4, 5632), - (45056, 5632, 1), - (45056, 5632, 1), - (45056, 5632, 1), - Inplace.INPLACE_B, - ), + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE ] # Data types used for testing @@ -166,7 +145,6 @@ def lib_swiglu(): if DEBUG: debug(c, ans, atol=atol, rtol=rtol) assert torch.allclose(c, ans, atol=atol, rtol=rtol) - print("out-of-place Test passed!") # Profiling workflow if PROFILE: From 642e8de0a9ea3e9c9180e8a1905dc34f6b748ad3 Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Tue, 25 Feb 2025 14:09:01 +0800 Subject: [PATCH 5/5] issue/66: add lib_random_sample() --- test/infiniop/random_sample.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/infiniop/random_sample.py b/test/infiniop/random_sample.py index 9584d4925..8ca1fdf97 100644 --- a/test/infiniop/random_sample.py +++ b/test/infiniop/random_sample.py @@ -171,6 +171,8 @@ def lib_random_sample(): ) ) + lib_random_sample() + if torch_device == "npu": synchronize_device(torch_device)
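
The swiglu.py changes above collapse the separate out-of-place and in-place test functions into a single test() driven by an in-place mode appended to each test case. Below is a minimal, self-contained sketch of that parametrization pattern, not taken verbatim from the patches: it assumes the Inplace enum members (rather than strings) are stored in _INPLACE so the comparisons inside test() match, and it uses a shortened case list for illustration.

from enum import Enum, auto


class Inplace(Enum):
    OUT_OF_PLACE = auto()
    INPLACE_A = auto()
    INPLACE_B = auto()


# Base cases carry only shape and strides; the in-place mode is appended below.
_TEST_CASES_ = [
    ((13, 4), None, None, None),
    ((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
]

# Enum members are stored directly so that `inplace == Inplace.INPLACE_A`
# style checks inside test() evaluate as intended.
_INPLACE = [Inplace.OUT_OF_PLACE, Inplace.INPLACE_A, Inplace.INPLACE_B]

# Cross product: every (shape, a_stride, b_stride, c_stride) tuple gains one
# in-place mode, so 2 base cases expand into 6 full cases and a single test()
# covers what the removed test_in_place1/test_in_place2 helpers used to do.
_TEST_CASES = [
    case + (mode,) for case in _TEST_CASES_ for mode in _INPLACE
]

# Inside test(), the output tensor is then derived from the mode:
#   Inplace.INPLACE_A     -> c aliases a
#   Inplace.INPLACE_B     -> c aliases b
#   Inplace.OUT_OF_PLACE  -> c is a freshly allocated tensor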