From ca2f34cf4a5f082e0448dd4cdd7b194e1aab2d89 Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Thu, 20 Feb 2025 14:05:09 +0800 Subject: [PATCH 1/5] issue/66: modified test py --- test/infiniop/causal_softmax.py | 179 ++++++++++--------- test/infiniop/random_sample.py | 162 ++++++++--------- test/infiniop/rearrange.py | 172 +++++++++--------- test/infiniop/rms_norm.py | 201 ++++++++++----------- test/infiniop/rotary_embedding.py | 80 +++++---- test/infiniop/swiglu.py | 287 ++++++++++++++---------------- 6 files changed, 544 insertions(+), 537 deletions(-) diff --git a/test/infiniop/causal_softmax.py b/test/infiniop/causal_softmax.py index a5c66bfbb..64ba65acc 100644 --- a/test/infiniop/causal_softmax.py +++ b/test/infiniop/causal_softmax.py @@ -1,26 +1,50 @@ -from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p +import torch import ctypes -import sys -import os - - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) -from operatorspy import ( - open_lib, - to_tensor, - DeviceEnum, +from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float +from libinfiniop import ( infiniopHandle_t, infiniopTensorDescriptor_t, - create_handle, - destroy_handle, + open_lib, + to_tensor, + get_test_devices, check_error, - rearrange_tensor, + rearrange_if_needed, create_workspace, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, ) -from operatorspy.tests.test_utils import get_args -import torch +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES = [ + # x_shape, x_stride + ((32, 512), None), + ((32, 512), (1024, 1)), + ((32, 5, 5), None), + ((32, 20, 512), None), + ((32, 20, 512), (20480, 512, 1)), # Ascend 暂不支持非连续 + ((32, 20, 4, 512), None), + ((32, 20, 4, 512), (81920, 2048, 512, 1)), + ] +# Data types used for testing +_TENSOR_DTYPES = [torch.float16, torch.float32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + torch.float16: {'atol': 0, 'rtol': 1e-2}, + torch.float32: {'atol': 0, 'rtol': 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 class CausalSoftmaxDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -37,88 +61,78 @@ def causal_softmax(x): return torch.nn.functional.softmax(masked, dim=-1).to(type) -def test(lib, handle, torch_device, x_shape, x_stride=None, x_dtype=torch.float16): +def test( + lib, + handle, + torch_device, + x_shape, + x_stride=None, + dtype=torch.float16 +): print( - f"Testing CausalSoftmax on {torch_device} with x_shape:{x_shape} x_stride:{x_stride} dtype:{x_dtype}" + f"Testing CausalSoftmax on {torch_device} with x_shape:{x_shape} x_stride:{x_stride} dtype:{dtype}" ) - x = torch.rand(x_shape, dtype=x_dtype).to(torch_device) - if x_stride is not None: - x = rearrange_tensor(x, x_stride) + x = torch.rand(x_shape, dtype=dtype).to(torch_device) + ans = causal_softmax(x) + + + x = rearrange_if_needed(x, x_stride) + x_tensor = to_tensor(x, lib) + descriptor = infiniopCausalSoftmaxDescriptor_t() check_error( lib.infiniopCreateCausalSoftmaxDescriptor( - handle, ctypes.byref(descriptor), x_tensor.descriptor + handle, + ctypes.byref(descriptor), + x_tensor.descriptor ) ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + 
x_tensor.descriptor.contents.invalidate() + + workspace_size = c_uint64(0) check_error( lib.infiniopGetCausalSoftmaxWorkspaceSize( descriptor, ctypes.byref(workspace_size) ) ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - x_tensor.descriptor.contents.invalidate() - workspace = create_workspace(workspace_size.value, x.device) - check_error( - lib.infiniopCausalSoftmax( - descriptor, - workspace.data_ptr() if workspace is not None else None, - workspace_size.value, - x_tensor.data, - None, + def lib_causal_softmax(): + check_error( + lib.infiniopCausalSoftmax( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + x_tensor.data, + None, + ) ) - ) - assert torch.allclose(x, ans, atol=0, rtol=1e-2) - check_error(lib.infiniopDestroyCausalSoftmaxDescriptor(descriptor)) - - -def test_cpu(lib, test_cases): - device = DeviceEnum.DEVICE_CPU - handle = create_handle(lib, device) - for x_shape, x_stride in test_cases: - test(lib, handle, "cpu", x_shape, x_stride) - destroy_handle(lib, handle) - - -def test_cuda(lib, test_cases): - device = DeviceEnum.DEVICE_CUDA - handle = create_handle(lib, device) - for x_shape, x_stride in test_cases: - test(lib, handle, "cuda", x_shape, x_stride) - destroy_handle(lib, handle) - - -def test_bang(lib, test_cases): - import torch_mlu - - device = DeviceEnum.DEVICE_BANG - handle = create_handle(lib, device) - for x_shape, x_stride in test_cases: - test(lib, handle, "mlu", x_shape, x_stride) - destroy_handle(lib, handle) + lib_causal_softmax() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(x, ans, atol=atol, rtol=rtol) + assert torch.allclose(x, ans, atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: causal_softmax(x), torch_device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_causal_softmax(), torch_device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(lib.infiniopDestroyCausalSoftmaxDescriptor(descriptor)) -def test_ascend(lib, test_cases): - import torch_npu - device = DeviceEnum.DEVICE_ASCEND - handle = create_handle(lib, device) - for x_shape, x_stride in test_cases: - test(lib, handle, "npu", x_shape, x_stride) - destroy_handle(lib, handle) if __name__ == "__main__": - test_cases = [ - # x_shape, x_stride - ((32, 20, 512), None), - ((32, 20, 512), (20480, 512, 1)), # Ascend 暂不支持非连续 - ] + args = get_args() lib = open_lib() lib.infiniopCreateCausalSoftmaxDescriptor.restype = c_int32 @@ -144,15 +158,14 @@ def test_ascend(lib, test_cases): lib.infiniopDestroyCausalSoftmaxDescriptor.argtypes = [ infiniopCausalSoftmaxDescriptor_t, ] + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) - if args.cpu: - test_cpu(lib, test_cases) - if args.cuda: - test_cuda(lib, test_cases) - if args.bang: - test_bang(lib, test_cases) - if args.ascend: - test_ascend(lib, test_cases) - if not (args.cpu or args.cuda or args.bang or args.ascend): - test_cpu(lib, test_cases) print("\033[92mTest passed!\033[0m") + diff --git a/test/infiniop/random_sample.py b/test/infiniop/random_sample.py index a5eb143ab..c2f4f0e5a 100644 --- a/test/infiniop/random_sample.py +++ b/test/infiniop/random_sample.py @@ -1,25 +1,47 @@ -from ctypes import POINTER, Structure, c_int32, c_uint64, 
c_void_p, c_float +import torch import ctypes -import sys -import os - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) -from operatorspy import ( - open_lib, - to_tensor, - DeviceEnum, +from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float +from libinfiniop import ( infiniopHandle_t, infiniopTensorDescriptor_t, - create_handle, - destroy_handle, + open_lib, + to_tensor, + get_test_devices, check_error, - rearrange_tensor, + rearrange_if_needed, create_workspace, - U64, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, ) -from operatorspy.tests.test_utils import get_args -import torch +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES = [ + # voc, random_val, topp, topk, temperature + (512, 0.8, 0.8, 3, 0.5), + (4096, 0.05, 0.9, 5, 1.0), + (16384, 0.15, 0.85, 10, 2.0), + (512, 0.08, 0, 3, 0.5), + (4096, 0.5, 0.9, 1, 1.0), + (16384, 0.15, 0, 1, 2.0), + (16384, 0.15, 0, 1, 2.0), + (32000, 0.08, 0.8, 50, 1.0), + (32000, 0.08, 1.0, 25, 1.0), + # (119696, 0.01, 1.0, 100, 1.0), +] + +# Data types used for testing +_TENSOR_DTYPES = [torch.float16, torch.float32] + + +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 class RandomSampleDescriptor(Structure): @@ -116,8 +138,8 @@ def test( ) # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - x_tensor.descriptor.contents.invalidate() - indices_tensor.descriptor.contents.invalidate() + for tensor in [x_tensor, indices_tensor]: + tensor.descriptor.contents.invalidate() workspace_size = c_uint64(0) check_error( @@ -126,77 +148,45 @@ def test( ) ) workspace = create_workspace(workspace_size.value, torch_device) - check_error( - lib.infiniopRandomSample( - descriptor, - workspace.data_ptr() if workspace is not None else None, - workspace_size.value, - indices_tensor.data, - x_tensor.data, - random_val, - topp, - topk, - temperature, - None, + + def lib_random_sample(): + check_error( + lib.infiniopRandomSample( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + indices_tensor.data, + x_tensor.data, + random_val, + topp, + topk, + temperature, + None, + ) ) - ) if torch_device == "npu": torch.npu.synchronize() assert indices[0].type(ans.dtype) == ans or data[ans] == data[indices[0]] + + # Profiling workflow + if PROFILE: + # fmt: off + if topp > 0 and topk > 1: + profile_operation("PyTorch", lambda: random_sample( + data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu" + ), torch_device, NUM_PRERUN, NUM_ITERATIONS) + else: + profile_operation("PyTorch", lambda: random_sample_0(data), torch_device, NUM_PRERUN, NUM_ITERATIONS) + + profile_operation(" lib", lambda: lib_random_sample(), torch_device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on check_error(lib.infiniopDestroyRandomSampleDescriptor(descriptor)) -def test_cpu(lib, test_cases): - device = DeviceEnum.DEVICE_CPU - handle = create_handle(lib, device) - for voc, random_val, topp, topk, temperature in test_cases: - test(lib, handle, "cpu", voc, random_val, topp, topk, temperature) - destroy_handle(lib, handle) - - -def test_cuda(lib, test_cases): - device = DeviceEnum.DEVICE_CUDA - handle = create_handle(lib, device) - for voc, random_val, topp, topk, temperature in 
test_cases: - test(lib, handle, "cuda", voc, random_val, topp, topk, temperature) - destroy_handle(lib, handle) - - -def test_bang(lib, test_cases): - import torch_mlu - - device = DeviceEnum.DEVICE_BANG - handle = create_handle(lib, device) - for voc, random_val, topp, topk, temperature in test_cases: - test(lib, handle, "mlu", voc, random_val, topp, topk, temperature) - destroy_handle(lib, handle) - - -def test_ascend(lib, test_cases): - import torch_npu - - device = DeviceEnum.DEVICE_ASCEND - handle = create_handle(lib, device) - for voc, random_val, topp, topk, temperature in test_cases: - test(lib, handle, "npu", voc, random_val, topp, topk, temperature) - destroy_handle(lib, handle) if __name__ == "__main__": - test_cases = [ - # voc, random_val, topp, topk, temperature - (512, 0.8, 0.8, 3, 0.5), - (4096, 0.05, 0.9, 5, 1.0), - (16384, 0.15, 0.85, 10, 2.0), - (512, 0.08, 0, 3, 0.5), - (4096, 0.5, 0.9, 1, 1.0), - (16384, 0.15, 0, 1, 2.0), - (16384, 0.15, 0, 1, 2.0), - (32000, 0.08, 0.8, 50, 1.0), - (32000, 0.08, 1.0, 25, 1.0), - # (119696, 0.01, 1.0, 100, 1.0), - ] args = get_args() lib = open_lib() @@ -229,14 +219,12 @@ def test_ascend(lib, test_cases): infiniopRandomSampleDescriptor_t, ] - if args.cpu: - test_cpu(lib, test_cases) - if args.cuda: - test_cuda(lib, test_cases) - if args.bang: - test_bang(lib, test_cases) - if args.ascend: - test_ascend(lib, test_cases) - if not (args.cpu or args.cuda or args.bang or args.ascend): - test_cpu(lib, test_cases) + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + # Execute tests + for device in get_test_devices(args): + test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/rearrange.py b/test/infiniop/rearrange.py index f9d5306c5..955ee1719 100644 --- a/test/infiniop/rearrange.py +++ b/test/infiniop/rearrange.py @@ -1,24 +1,51 @@ +import torch import ctypes -from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p -import sys -import os - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) -from operatorspy import ( - open_lib, - to_tensor, - CTensor, - DeviceEnum, +from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float +from libinfiniop import ( infiniopHandle_t, infiniopTensorDescriptor_t, - create_handle, - destroy_handle, + open_lib, + to_tensor, + get_test_devices, check_error, - rearrange_tensor, + rearrange_if_needed, + create_workspace, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, ) -from operatorspy.tests.test_utils import get_args -import torch +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES = [ + # ((src_shape, src_stride), (dst_shape, dst_stride)) + (((2, 4, 32), None), ((2, 4, 32), (256, 64, 1))), + (((32, 6, 64), (64, 2560, 1)), ((32, 6, 64), None)), + (((4, 6, 64), (64, 2560, 1)), ((4, 6, 64), (131072, 64, 1))), + (((1, 32, 64), (2048, 64, 1)), ((1, 32, 64), (2048, 64, 1))), + (((32, 1, 64), (64, 2560, 1)), ((32, 1, 64), (64, 64, 1))), + (((4, 1, 64), (64, 2560, 1)), ((4, 1, 64), (64, 11264, 1))), + (((64,), (1,)), ((64,), (1,))), +] + +# Data types used for testing +_TENSOR_DTYPES = [torch.float16, torch.float32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + 
torch.float16: {"atol": 0, "rtol": 1e-3}, + torch.float32: {"atol": 0, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + class RerrangeDescriptor(Structure): @@ -43,12 +70,13 @@ def test( ) x = torch.rand(x_shape, dtype=x_dtype).to(torch_device) y = torch.zeros(y_shape, dtype=x_dtype).to(torch_device) - if x_stride is not None: - x = rearrange_tensor(x, x_stride) - if y_stride is not None: - y = rearrange_tensor(y, y_stride) - x_tensor = to_tensor(x, lib) - y_tensor = to_tensor(y, lib) + + x, y = [ + rearrange_if_needed(tensor, stride) + for tensor, stride in zip([x, y], [x_stride, y_stride]) + ] + x_tensor, y_tensor = [to_tensor(tensor, lib) for tensor in [x, y]] + descriptor = infiniopRearrangeDescriptor_t() check_error( @@ -58,71 +86,42 @@ def test( ) # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - x_tensor.descriptor.contents.invalidate() - y_tensor.descriptor.contents.invalidate() + for tensor in [x_tensor, y_tensor]: + tensor.descriptor.contents.invalidate() + + def lib_rearrange(): + check_error( + lib.infiniopRearrange( + descriptor, + y_tensor.data, + x_tensor.data, + None + ) + ) + lib_rearrange() + + # Validate results + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(x, y, atol=atol, rtol=rtol) + assert torch.allclose(x, y, atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: rearrange_tensor(y, y_stride), torch_device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_rearrange(), torch_device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on - check_error(lib.infiniopRearrange(descriptor, y_tensor.data, x_tensor.data, None)) - assert torch.allclose(x, y, atol=0, rtol=1e-3) check_error(lib.infiniopDestroyRearrangeDescriptor(descriptor)) -def test_cpu(lib, test_cases): - device = DeviceEnum.DEVICE_CPU - handle = create_handle(lib, device) - for test_case in test_cases: - x_shape, x_stride = test_case[0] - y_shape, y_stride = test_case[1] - test(lib, handle, "cpu", x_shape, x_stride, y_shape, y_stride) - destroy_handle(lib, handle) - - -def test_cuda(lib, test_cases): - device = DeviceEnum.DEVICE_CUDA - handle = create_handle(lib, device) - for test_case in test_cases: - x_shape, x_stride = test_case[0] - y_shape, y_stride = test_case[1] - test(lib, handle, "cuda", x_shape, x_stride, y_shape, y_stride) - destroy_handle(lib, handle) - - -def test_bang(lib, test_cases): - import torch_mlu - - device = DeviceEnum.DEVICE_BANG - handle = create_handle(lib, device) - for test_case in test_cases: - x_shape, x_stride = test_case[0] - y_shape, y_stride = test_case[1] - test(lib, handle, "mlu", x_shape, x_stride, y_shape, y_stride) - destroy_handle(lib, handle) - - -def test_ascend(lib, test_cases): - import torch_npu - - device = DeviceEnum.DEVICE_ASCEND - handle = create_handle(lib, device) - for test_case in test_cases: - x_shape, x_stride = test_case[0] - y_shape, y_stride = test_case[1] - test(lib, handle, "npu", x_shape, x_stride, y_shape, y_stride) - destroy_handle(lib, handle) if __name__ == "__main__": args = get_args() - test_cases = [ - # ((src_shape, src_stride), (dst_shape, dst_stride)) - (((2, 4, 32), None), ((2, 4, 32), (256, 64, 1))), - (((32, 6, 64), (64, 2560, 1)), ((32, 6, 64), None)), - (((4, 6, 64), (64, 2560, 1)), ((4, 6, 64), (131072, 64, 1))), - (((1, 32, 64), (2048, 64, 1)), ((1, 32, 64), (2048, 64, 1))), - (((32, 1, 64), (64, 2560, 1)), ((32, 1, 64), (64, 64, 1))), - 
(((4, 1, 64), (64, 2560, 1)), ((4, 1, 64), (64, 11264, 1))), - (((64,), (1,)), ((64,), (1,))), - ] lib = open_lib() + lib.infiniopCreateRearrangeDescriptor.restype = c_int32 lib.infiniopCreateRearrangeDescriptor.argtypes = [ infiniopHandle_t, @@ -139,12 +138,15 @@ def test_ascend(lib, test_cases): ] lib.infiniopDestroyRearrangeDescriptor.restype = c_int32 lib.infiniopDestroyRearrangeDescriptor.argtypes = [infiniopRearrangeDescriptor_t] - if args.cpu: - test_cpu(lib, test_cases) - if args.cuda: - test_cuda(lib, test_cases) - if args.bang: - test_bang(lib, test_cases) - if args.ascend: - test_ascend(lib, test_cases) + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + # Execute tests + for device in get_test_devices(args): + test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/rms_norm.py b/test/infiniop/rms_norm.py index 21e27348e..0adf37241 100644 --- a/test/infiniop/rms_norm.py +++ b/test/infiniop/rms_norm.py @@ -1,25 +1,49 @@ from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float import ctypes -import sys -import os - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) -from operatorspy import ( - open_lib, - to_tensor, - DeviceEnum, +import torch +import ctypes +from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float +from libinfiniop import ( infiniopHandle_t, infiniopTensorDescriptor_t, - create_handle, - destroy_handle, + open_lib, + to_tensor, + get_test_devices, check_error, - rearrange_tensor, + rearrange_if_needed, create_workspace, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, ) -from operatorspy.tests.test_utils import get_args -import torch - +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules + +_TEST_CASES = [ + # y_shape, x_shape, w_shape, y_stride, x_stride, w_dtype + ((16, 2048), (16, 2048), (2048,), None, None,torch.float32), + ((16, 2048), (16, 2048), (2048,), None, None, torch.float16), + ((16, 2048), (16, 2048), (2048,), (4096, 1), (4096, 1),torch.float32), + ((16, 2048), (16, 2048), (2048,), (4096, 1), (4096, 1), torch.float16), +] +# x types used for testing +_TENSOR_DTYPES = [torch.float16, torch.float32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + torch.float16: {"atol": 0, "rtol": 1e-2}, + torch.float32: {"atol": 0, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 class RMSNormDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -27,7 +51,6 @@ class RMSNormDescriptor(Structure): infiniopRMSNormDescriptor_t = POINTER(RMSNormDescriptor) - def rms_norm(x, w, eps): input_dtype = x.dtype hidden_states = x.to(torch.float32) @@ -37,19 +60,18 @@ def rms_norm(x, w, eps): def test( - lib, - handle, - torch_device, - y_shape, - x_shape, - w_shape, - dtype=torch.float16, - w_dtype=torch.float16, -): - print( - f"Testing RMS_Norm on {torch_device} with y_shape:{y_shape} x_shape:{x_shape} w_shape:{w_shape}" - f" dtype:{dtype} w_dtype:{w_dtype}" - ) + lib, + handle, + torch_device, + y_shape, + x_shape, + w_shape, + y_stride, + x_stride, + dtype=torch.float16, + w_dtype=torch.float16): + print(f"Testing RMS_Norm on 
{torch_device} with y_shape:{y_shape} x_shape:{x_shape} w_shape:{w_shape}" + f" dtype:{dtype} w_dtype:{w_dtype}") y = torch.zeros(y_shape, dtype=dtype).to(torch_device) x = torch.rand(x_shape, dtype=dtype).to(torch_device) @@ -58,93 +80,64 @@ def test( eps = 1e-5 ans = rms_norm(x, w, eps) - y_tensor = to_tensor(y, lib) - x_tensor = to_tensor(x, lib) - w_tensor = to_tensor(w, lib) + x = rearrange_if_needed(x, x_stride) + y = rearrange_if_needed(y, y_stride) + + x_tensor, y_tensor, w_tensor = [to_tensor(tensor, lib) for tensor in [x, y, w]] descriptor = infiniopRMSNormDescriptor_t() - w_dataType = 0 if w_dtype == torch.float16 else 1 + w_dataType = 0 if w_dtype==torch.float16 else 1 check_error( lib.infiniopCreateRMSNormDescriptor( - handle, - ctypes.byref(descriptor), - y_tensor.descriptor, - x_tensor.descriptor, - w_tensor.descriptor, - eps, + handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor, + w_tensor.descriptor, eps ) ) # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - x_tensor.descriptor.contents.invalidate() - y_tensor.descriptor.contents.invalidate() - w_tensor.descriptor.contents.invalidate() + for tensor in [x_tensor, y_tensor, w_tensor]: + tensor.descriptor.contents.invalidate() workspace_size = c_uint64(0) check_error( - lib.infiniopGetRMSNormWorkspaceSize(descriptor, ctypes.byref(workspace_size)) + lib.infiniopGetRMSNormWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) ) workspace = create_workspace(workspace_size.value, y.device) - check_error( - lib.infiniopRMSNorm( - descriptor, - workspace.data_ptr() if workspace is not None else None, - workspace_size.value, - y_tensor.data, - x_tensor.data, - w_tensor.data, - None, + def lib_rms_norm(): + check_error( + lib.infiniopRMSNorm( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + y_tensor.data, + x_tensor.data, + w_tensor.data, + None, + ) ) - ) - assert torch.allclose(y.to(dtype), ans.to(dtype), atol=1e-3, rtol=1e-3) + lib_rms_norm() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y, ans, atol=atol, rtol=rtol) + assert torch.allclose(y, ans, atol=atol, rtol=rtol) + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: rms_norm(x, w, eps), torch_device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_rms_norm(), torch_device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on check_error(lib.infiniopDestroyRMSNormDescriptor(descriptor)) -def test_cpu(lib, test_cases): - device = DeviceEnum.DEVICE_CPU - handle = create_handle(lib, device) - for y_shape, x_shape, w_shape, dtype, w_dtype in test_cases: - test(lib, handle, "cpu", y_shape, x_shape, w_shape, dtype, w_dtype) - destroy_handle(lib, handle) - - -def test_cuda(lib, test_cases): - device = DeviceEnum.DEVICE_CUDA - handle = create_handle(lib, device) - for y_shape, x_shape, w_shape, dtype, w_dtype in test_cases: - test(lib, handle, "cuda", y_shape, x_shape, w_shape, dtype, w_dtype) - destroy_handle(lib, handle) - - -def test_bang(lib, test_cases): - import torch_mlu - - device = DeviceEnum.DEVICE_BANG - handle = create_handle(lib, device) - for y_shape, x_shape, w_shape, dtype, w_dtype in test_cases: - test(lib, handle, "mlu", y_shape, x_shape, w_shape, dtype, w_dtype) - destroy_handle(lib, handle) - - -def test_ascend(lib, test_cases): - import torch_npu - - device = DeviceEnum.DEVICE_ASCEND - handle = create_handle(lib, device) - for y_shape, x_shape, w_shape, 
dtype, w_dtype in test_cases: - test(lib, handle, "npu", y_shape, x_shape, w_shape, dtype, w_dtype) - - destroy_handle(lib, handle) if __name__ == "__main__": - test_cases = [ - # y_shape, x_shape, w_shape, dtype, w_dtype - ((16, 2048), (16, 2048), (2048,), torch.float16, torch.float16), - ((16, 2048), (16, 2048), (2048,), torch.float16, torch.float32), - ] + args = get_args() lib = open_lib() lib.infiniopCreateRMSNormDescriptor.restype = c_int32 @@ -178,14 +171,16 @@ def test_ascend(lib, test_cases): infiniopRMSNormDescriptor_t, ] - if args.cpu: - test_cpu(lib, test_cases) - if args.cuda: - test_cuda(lib, test_cases) - if args.bang: - test_bang(lib, test_cases) - if args.ascend: - test_ascend(lib, test_cases) - if not (args.cpu or args.cuda or args.bang or args.ascend): - test_cpu(lib, test_cases) + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + # Execute tests + for device in get_test_devices(args): + test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) + print("\033[92mTest passed!\033[0m") + + diff --git a/test/infiniop/rotary_embedding.py b/test/infiniop/rotary_embedding.py index 9e9a29866..88fde17d0 100644 --- a/test/infiniop/rotary_embedding.py +++ b/test/infiniop/rotary_embedding.py @@ -1,9 +1,6 @@ +import torch import ctypes -from ctypes import POINTER, c_void_p, c_int32, c_uint64, Structure, byref -import sys -import os - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float from libinfiniop import ( infiniopHandle_t, infiniopTensorDescriptor_t, @@ -16,10 +13,33 @@ test_operator, get_args, debug, + get_tolerance, profile_operation, - InfiniDtype, ) -import torch + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES = [ + # (t_shape, t_strides) + ((1, 32, 128), None), + ((1, 32, 64), None), + # 昇腾暂不满足这个用例,最后一维度 <=32 会有问题,可能与其核心 + # 接口 GatherMask 的内部实现相关,目前 48 64 128 都可以支持 + ((4, 1, 32), None), + ((1, 32, 128), None), + ((3, 32, 128), (8000, 200, 1)), +] + +# Data types used for testing +_TENSOR_DTYPES = [torch.float16, torch.float32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + torch.float16: {"atol": 0, "rtol": 1e-2}, + torch.float32: {"atol": 0, "rtol": 1e-3}, +} DEBUG = False PROFILE = False @@ -27,6 +47,7 @@ NUM_ITERATIONS = 1000 + class RoPEDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -75,13 +96,22 @@ def sin_cos_table(max_seq_len, dim, torch_device, theta): return torch.sin(angles), torch.cos(angles) -def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16): +def test( + lib, + handle, + torch_device, + shape, + strides=None, + dtype=torch.float16 +): print( f"Testing Rotary Positional Embedding on {torch_device} with shape:{shape} strides:{strides} and dtype:{dtype}" ) t = torch.rand(shape, dtype=dtype) - t = rearrange_if_needed(t, strides).to(torch_device) + + t = rearrange_if_needed(t, strides) + posTmp = torch.arange(0, t.shape[0]).to(torch_device) pos = torch.zeros(2 * posTmp.shape[0], dtype=torch.int32) for i in range(posTmp.shape[0]): @@ -95,11 +125,12 @@ def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16): descriptor = 
infiniopRoPEDescriptor_t() # 2x table length for test sin_table, cos_table = sin_cos_table(t.shape[0] * 2, t.shape[2], t.device, theta) - t_tensor = to_tensor(t, lib) + + t_tensor, sin_table_tensor, cos_table_tensor = [to_tensor(tensor, lib) for tensor in [t, sin_table, cos_table]] + pos_tensor = to_tensor(pos[: t.shape[0]], lib) pos_tensor.descriptor.contents.dtype = InfiniDtype.U64 - sin_table_tensor = to_tensor(sin_table, lib) - cos_table_tensor = to_tensor(cos_table, lib) + if torch_device == "npu": torch.npu.synchronize() @@ -116,10 +147,8 @@ def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16): ) # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - t_tensor.descriptor.contents.invalidate() - pos_tensor.descriptor.contents.invalidate() - sin_table_tensor.descriptor.contents.invalidate() - cos_table_tensor.descriptor.contents.invalidate() + for tensor in [t_tensor, pos_tensor, sin_table_tensor, cos_table_tensor]: + tensor.descriptor.contents.invalidate() workspace_size = c_uint64(0) check_error( @@ -142,9 +171,11 @@ def lib_rope(): ) lib_rope() + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) if DEBUG: - debug(t, ans, atol=1e-4, rtol=1e-2) - assert torch.allclose(t, ans, atol=1e-4, rtol=1e-2) + debug(t, ans, atol=atol, rtol=rtol) + assert torch.allclose(t, ans, atol=atol, rtol=rtol) + if PROFILE: profile_operation( "PyTorch", @@ -161,17 +192,6 @@ def lib_rope(): if __name__ == "__main__": - test_cases = [ - # (t_shape, t_strides) - ((1, 32, 128), None), - ((1, 32, 64), None), - # 昇腾暂不满足这个用例,最后一维度 <=32 会有问题,可能与其核心 - # 接口 GatherMask 的内部实现相关,目前 48 64 128 都可以支持 - ((4, 1, 32), None), - ((1, 32, 128), None), - ((3, 32, 128), (8000, 200, 1)), - ] - test_dtypes = [torch.float16] args = get_args() lib = open_lib() lib.infiniopCreateRoPEDescriptor.restype = c_int32 @@ -211,5 +231,5 @@ def lib_rope(): # Execute tests for device in get_test_devices(args): - test_operator(lib, device, test, test_cases, test_dtypes) + test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/swiglu.py b/test/infiniop/swiglu.py index 67b3c2b85..dd5608ab5 100644 --- a/test/infiniop/swiglu.py +++ b/test/infiniop/swiglu.py @@ -1,25 +1,50 @@ -from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p +import torch import ctypes -import sys -import os - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) -from operatorspy import ( - open_lib, - to_tensor, - CTensor, - DeviceEnum, +from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float +from libinfiniop import ( infiniopHandle_t, infiniopTensorDescriptor_t, - create_handle, - destroy_handle, + open_lib, + to_tensor, + get_test_devices, check_error, - rearrange_tensor, + rearrange_if_needed, + create_workspace, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, ) -from operatorspy.tests.test_utils import get_args -import torch - +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((16, 5632), None, None, 
None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] +# Data types used for testing +_TENSOR_DTYPES = [torch.float16, torch.float32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + torch.float16: {'atol': 0, 'rtol': 1e-2}, + torch.float32: {'atol': 0, 'rtol': 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 class SwiGLUDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -51,20 +76,18 @@ def test_out_of_place( b = torch.rand(shape, dtype=dtype).to(torch_device) c = torch.rand(shape, dtype=dtype).to(torch_device) - if a_stride is not None: - a = rearrange_tensor(a, a_stride) - if b_stride is not None: - b = rearrange_tensor(b, b_stride) - if c_stride is not None: - c = rearrange_tensor(c, c_stride) ans = swiglu(a, b) + a, b, c = [ + rearrange_if_needed(tensor, stride) + for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_stride]) + ] + a_tensor, b_tensor, c_tensor = [to_tensor(tensor, lib) for tensor in [a, b, c]] + + if sync is not None: sync() - a_tensor = to_tensor(a, lib) - b_tensor = to_tensor(b, lib) - c_tensor = to_tensor(c, lib) descriptor = infiniopSwiGLUDescriptor_t() check_error( lib.infiniopCreateSwiGLUDescriptor( @@ -77,19 +100,33 @@ def test_out_of_place( ) # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - a_tensor.descriptor.contents.invalidate() - b_tensor.descriptor.contents.invalidate() - c_tensor.descriptor.contents.invalidate() - - check_error( - lib.infiniopSwiGLU( - descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + for tensor in [a_tensor, b_tensor, c_tensor]: + tensor.descriptor.contents.invalidate() + + def lib_swiglu(): + check_error( + lib.infiniopSwiGLU( + descriptor, + c_tensor.data, + a_tensor.data, + b_tensor.data, + None + ) ) - ) + lib_swiglu() - assert torch.allclose(c, ans, atol=1e-4, rtol=1e-2) + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c, ans, atol=atol, rtol=rtol) + assert torch.allclose(c, ans, atol=atol, rtol=rtol) print("out-of-place Test passed!") + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: swiglu(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_swiglu(), torch_device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor)) @@ -106,18 +143,19 @@ def test_in_place1( a = torch.rand(shape, dtype=dtype).to(torch_device) b = torch.rand(shape, dtype=dtype).to(torch_device) - if a_stride is not None: - a = rearrange_tensor(a, a_stride) - if b_stride is not None: - b = rearrange_tensor(b, b_stride) ans = swiglu(a, b) if sync is not None: sync() - a_tensor = to_tensor(a, lib) - b_tensor = to_tensor(b, lib) + a, b = [ + rearrange_if_needed(tensor, stride) + for tensor, stride in zip([a, b], [a_stride, b_stride]) + ] + a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]] + descriptor = infiniopSwiGLUDescriptor_t() + check_error( lib.infiniopCreateSwiGLUDescriptor( handle, @@ -129,18 +167,27 @@ def test_in_place1( ) # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - a_tensor.descriptor.contents.invalidate() - b_tensor.descriptor.contents.invalidate() - - check_error( - lib.infiniopSwiGLU( - descriptor, a_tensor.data, a_tensor.data, b_tensor.data, None + for tensor in [a_tensor, 
b_tensor]: + tensor.descriptor.contents.invalidate() + def lib_swiglu(): + check_error( + lib.infiniopSwiGLU( + descriptor, a_tensor.data, a_tensor.data, b_tensor.data, None + ) ) - ) + lib_swiglu() - assert torch.allclose(a, ans, atol=1e-4, rtol=1e-2) + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(a, ans, atol=atol, rtol=rtol) + assert torch.allclose(a, ans, atol=atol, rtol=rtol) print("in-place1 Test passed!") - + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: swiglu(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_swiglu(), torch_device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor)) @@ -157,17 +204,17 @@ def test_in_place2( a = torch.rand(shape, dtype=dtype).to(torch_device) b = torch.rand(shape, dtype=dtype).to(torch_device) - if a_stride is not None: - a = rearrange_tensor(a, a_stride) - if b_stride is not None: - b = rearrange_tensor(b, b_stride) ans = swiglu(a, b) if sync is not None: sync() - a_tensor = to_tensor(a, lib) - b_tensor = to_tensor(b, lib) + a, b = [ + rearrange_if_needed(tensor, stride) + for tensor, stride in zip([a, b], [a_stride, b_stride]) + ] + a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]] + descriptor = infiniopSwiGLUDescriptor_t() check_error( lib.infiniopCreateSwiGLUDescriptor( @@ -180,100 +227,42 @@ def test_in_place2( ) # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - a_tensor.descriptor.contents.invalidate() - b_tensor.descriptor.contents.invalidate() - - check_error( - lib.infiniopSwiGLU( - descriptor, b_tensor.data, a_tensor.data, b_tensor.data, None + for tensor in [a_tensor, b_tensor]: + tensor.descriptor.contents.invalidate() + + def lib_swiglu(): + check_error( + lib.infiniopSwiGLU( + descriptor, b_tensor.data, a_tensor.data, b_tensor.data, None + ) ) - ) - - assert torch.allclose(b, ans, atol=1e-4, rtol=1e-2) - + lib_swiglu() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(b, ans, atol=atol, rtol=rtol) + assert torch.allclose(b, ans, atol=atol, rtol=rtol) + print("in-place2 Test passed!") + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: swiglu(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_swiglu(), torch_device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor)) -def test_cpu(lib, test_cases): - device = DeviceEnum.DEVICE_CPU - handle = create_handle(lib, device) - - for shape, a_stride, b_stride, c_stride, dtype in test_cases: - test_out_of_place( - lib, handle, "cpu", shape, a_stride, b_stride, c_stride, dtype - ) - test_in_place1(lib, handle, "cpu", shape, a_stride, b_stride, dtype) - test_in_place2(lib, handle, "cpu", shape, a_stride, b_stride, dtype) - - destroy_handle(lib, handle) - - -def test_cuda(lib, test_cases): - device = DeviceEnum.DEVICE_CUDA - handle = create_handle(lib, device) - - for shape, a_stride, b_stride, c_stride, dtype in test_cases: - test_out_of_place( - lib, handle, "cuda", shape, a_stride, b_stride, c_stride, dtype - ) - test_in_place1(lib, handle, "cuda", shape, a_stride, b_stride, dtype) - test_in_place2(lib, handle, "cuda", shape, a_stride, b_stride, dtype) - - destroy_handle(lib, handle) - - -def test_bang(lib, test_cases): - import torch_mlu - - device = DeviceEnum.DEVICE_BANG - handle = create_handle(lib, device) - - for 
shape, a_stride, b_stride, c_stride, dtype in test_cases: - test_out_of_place( - lib, handle, "mlu", shape, a_stride, b_stride, c_stride, dtype - ) - test_in_place1(lib, handle, "mlu", shape, a_stride, b_stride, dtype) - test_in_place2(lib, handle, "mlu", shape, a_stride, b_stride, dtype) - - destroy_handle(lib, handle) - - -def test_ascend(lib, test_cases): - import torch_npu - - device = DeviceEnum.DEVICE_ASCEND - handle = create_handle(lib, device) - - for shape, a_stride, b_stride, c_stride, dtype in test_cases: - test_out_of_place( - lib, - handle, - "npu", - shape, - a_stride, - b_stride, - c_stride, - dtype, - torch.npu.synchronize, - ) - test_in_place1( - lib, handle, "npu", shape, a_stride, b_stride, dtype, torch.npu.synchronize - ) - test_in_place2( - lib, handle, "npu", shape, a_stride, b_stride, dtype, torch.npu.synchronize - ) +def test(lib, handle, torch_device, shape, a_stride, b_stride, c_stride, dtype, sync = None): + test_out_of_place( + lib, handle, torch_device, shape, a_stride, b_stride, c_stride, dtype, sync + ) + test_in_place1(lib, handle, torch_device, shape, a_stride, b_stride, dtype, sync) + test_in_place2(lib, handle, torch_device, shape, a_stride, b_stride, dtype, sync) - destroy_handle(lib, handle) if __name__ == "__main__": - test_cases = [ - # shape, a_stride, b_stride, c_stride, dtype - ((13, 4), None, None, None, torch.float16), - ((13, 4), (10, 1), (10, 1), (10, 1), torch.float16), - ((16, 5632), None, None, None, torch.float16), - ((16, 5632), (13312, 1), (13312, 1), (13312, 1), torch.float16), - ] + args = get_args() lib = open_lib() @@ -299,13 +288,13 @@ def test_ascend(lib, test_cases): lib.infiniopDestroySwiGLUDescriptor.argtypes = [ infiniopSwiGLUDescriptor_t, ] + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) - if args.cpu: - test_cpu(lib, test_cases) - if args.cuda: - test_cuda(lib, test_cases) - if args.bang: - test_bang(lib, test_cases) - if args.ascend: - test_ascend(lib, test_cases) print("\033[92mTest passed!\033[0m") From 04aa18f655f1ef1f259e1a9566f7bd37d042fecd Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Mon, 24 Feb 2025 14:03:30 +0800 Subject: [PATCH 2/5] issue/66: modified format --- test/infiniop/causal_softmax.py | 57 ++++---- test/infiniop/random_sample.py | 64 ++++++--- test/infiniop/rearrange.py | 39 +++-- test/infiniop/rms_norm.py | 63 +++++---- test/infiniop/rotary_embedding.py | 47 +++--- test/infiniop/swiglu.py | 228 ++++++++++-------------------- 6 files changed, 213 insertions(+), 285 deletions(-) diff --git a/test/infiniop/causal_softmax.py b/test/infiniop/causal_softmax.py index 64ba65acc..9f6385d9f 100644 --- a/test/infiniop/causal_softmax.py +++ b/test/infiniop/causal_softmax.py @@ -23,22 +23,20 @@ # These are not meant to be imported from other modules _TEST_CASES = [ - # x_shape, x_stride - ((32, 512), None), - ((32, 512), (1024, 1)), - ((32, 5, 5), None), - ((32, 20, 512), None), - ((32, 20, 512), (20480, 512, 1)), # Ascend 暂不支持非连续 - ((32, 20, 4, 512), None), - ((32, 20, 4, 512), (81920, 2048, 512, 1)), - ] + # x_shape, x_stride + ((32, 512), None), + ((32, 512), (1024, 1)), + ((32, 5, 5), None), + ((32, 20, 512), None), + ((32, 20, 512), (20480, 512, 1)), # Ascend 暂不支持非连续 +] + # Data types used for testing -_TENSOR_DTYPES = [torch.float16, torch.float32] +_TENSOR_DTYPES = [torch.float16] # Tolerance map for different 
data types _TOLERANCE_MAP = { - torch.float16: {'atol': 0, 'rtol': 1e-2}, - torch.float32: {'atol': 0, 'rtol': 1e-3}, + torch.float16: {"atol": 0, "rtol": 1e-2}, } DEBUG = False @@ -46,6 +44,7 @@ NUM_PRERUN = 10 NUM_ITERATIONS = 1000 + class CausalSoftmaxDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -61,39 +60,29 @@ def causal_softmax(x): return torch.nn.functional.softmax(masked, dim=-1).to(type) -def test( - lib, - handle, - torch_device, - x_shape, - x_stride=None, - dtype=torch.float16 -): +def test(lib, handle, torch_device, x_shape, x_stride=None, dtype=torch.float16): print( f"Testing CausalSoftmax on {torch_device} with x_shape:{x_shape} x_stride:{x_stride} dtype:{dtype}" ) + x = torch.rand(x_shape, dtype=dtype).to(torch_device) ans = causal_softmax(x) - x = rearrange_if_needed(x, x_stride) - + x_tensor = to_tensor(x, lib) descriptor = infiniopCausalSoftmaxDescriptor_t() check_error( lib.infiniopCreateCausalSoftmaxDescriptor( - handle, - ctypes.byref(descriptor), - x_tensor.descriptor + handle, ctypes.byref(descriptor), x_tensor.descriptor ) ) # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel x_tensor.descriptor.contents.invalidate() - workspace_size = c_uint64(0) check_error( lib.infiniopGetCausalSoftmaxWorkspaceSize( @@ -101,6 +90,7 @@ def test( ) ) workspace = create_workspace(workspace_size.value, x.device) + def lib_causal_softmax(): check_error( lib.infiniopCausalSoftmax( @@ -111,8 +101,9 @@ def lib_causal_softmax(): None, ) ) + lib_causal_softmax() - + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) if DEBUG: debug(x, ans, atol=atol, rtol=rtol) @@ -128,24 +119,23 @@ def lib_causal_softmax(): check_error(lib.infiniopDestroyCausalSoftmaxDescriptor(descriptor)) - - - if __name__ == "__main__": - args = get_args() lib = open_lib() + lib.infiniopCreateCausalSoftmaxDescriptor.restype = c_int32 lib.infiniopCreateCausalSoftmaxDescriptor.argtypes = [ infiniopHandle_t, POINTER(infiniopCausalSoftmaxDescriptor_t), infiniopTensorDescriptor_t, ] + lib.infiniopGetCausalSoftmaxWorkspaceSize.restype = c_int32 lib.infiniopGetCausalSoftmaxWorkspaceSize.argtypes = [ infiniopCausalSoftmaxDescriptor_t, POINTER(c_uint64), ] + lib.infiniopCausalSoftmax.restype = c_int32 lib.infiniopCausalSoftmax.argtypes = [ infiniopCausalSoftmaxDescriptor_t, @@ -154,18 +144,19 @@ def lib_causal_softmax(): c_void_p, c_void_p, ] + lib.infiniopDestroyCausalSoftmaxDescriptor.restype = c_int32 lib.infiniopDestroyCausalSoftmaxDescriptor.argtypes = [ infiniopCausalSoftmaxDescriptor_t, ] + # Configure testing options DEBUG = args.debug PROFILE = args.profile NUM_PRERUN = args.num_prerun NUM_ITERATIONS = args.num_iterations - + for device in get_test_devices(args): test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) print("\033[92mTest passed!\033[0m") - diff --git a/test/infiniop/random_sample.py b/test/infiniop/random_sample.py index c2f4f0e5a..c5741e245 100644 --- a/test/infiniop/random_sample.py +++ b/test/infiniop/random_sample.py @@ -12,33 +12,40 @@ create_workspace, test_operator, get_args, - debug, + debug_all, get_tolerance, profile_operation, + synchronize_device, ) # ============================================================================== # Configuration (Internal Use Only) # ============================================================================== # These are not meant to be imported from other modules + _TEST_CASES = [ # voc, random_val, topp, topk, temperature - (512, 0.8, 0.8, 3, 0.5), - (4096, 0.05, 0.9, 5, 1.0), - 
(16384, 0.15, 0.85, 10, 2.0), - (512, 0.08, 0, 3, 0.5), - (4096, 0.5, 0.9, 1, 1.0), - (16384, 0.15, 0, 1, 2.0), - (16384, 0.15, 0, 1, 2.0), - (32000, 0.08, 0.8, 50, 1.0), - (32000, 0.08, 1.0, 25, 1.0), - # (119696, 0.01, 1.0, 100, 1.0), + (512, 0.8, 0.8, 3, 0.5), + (4096, 0.05, 0.9, 5, 1.0), + (16384, 0.15, 0.85, 10, 2.0), + (512, 0.08, 0, 3, 0.5), + (4096, 0.5, 0.9, 1, 1.0), + (16384, 0.15, 0, 1, 2.0), + (16384, 0.15, 0, 1, 2.0), + (32000, 0.08, 0.8, 50, 1.0), + (32000, 0.08, 1.0, 25, 1.0), + # (119696, 0.01, 1.0, 100, 1.0), ] # Data types used for testing -_TENSOR_DTYPES = [torch.float16, torch.float32] +_TENSOR_DTYPES = [torch.float16] + +_TOLERANCE_MAP = { + torch.float16: {"atol": 0, "rtol": 0}, +} +DEBUG = False PROFILE = False NUM_PRERUN = 10 NUM_ITERATIONS = 1000 @@ -113,6 +120,7 @@ def test( x_dtype=torch.float16, ): print(f"Testing RandomSample on {torch_device} with voc:{voc} dtype:{x_dtype}") + data = torch.arange(voc).float() * 0.0001 _perm = torch.randperm(voc) data = data[_perm].to(x_dtype).to(torch_device) @@ -122,9 +130,11 @@ def test( ) else: ans = random_sample_0(data) + indices = torch.zeros([1], dtype=torch.int64).to(torch_device) - x_tensor = to_tensor(data, lib) - indices_tensor = to_tensor(indices, lib) + + x_tensor, indices_tensor = [to_tensor(tensor, lib) for tensor in [data, indices]] + indices_tensor.descriptor.contents.dt = U64 # treat int64 as uint64 descriptor = infiniopRandomSampleDescriptor_t() @@ -148,7 +158,7 @@ def test( ) ) workspace = create_workspace(workspace_size.value, torch_device) - + def lib_random_sample(): check_error( lib.infiniopRandomSample( @@ -164,11 +174,21 @@ def lib_random_sample(): None, ) ) - if torch_device == "npu": - torch.npu.synchronize() + if torch_device == "npu": + synchronize_device(torch_device) + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug_all( + (indices[0].type(ans.dtype), data[indices[0]]), + (ans, data[ans]), + "or", + atol=atol, + rtol=rtol, + ) assert indices[0].type(ans.dtype) == ans or data[ans] == data[indices[0]] - + # Profiling workflow if PROFILE: # fmt: off @@ -184,23 +204,23 @@ def lib_random_sample(): check_error(lib.infiniopDestroyRandomSampleDescriptor(descriptor)) - - if __name__ == "__main__": - args = get_args() lib = open_lib() + lib.infiniopCreateRandomSampleDescriptor.restype = c_int32 lib.infiniopCreateRandomSampleDescriptor.argtypes = [ infiniopHandle_t, POINTER(infiniopRandomSampleDescriptor_t), infiniopTensorDescriptor_t, ] + lib.infiniopGetRandomSampleWorkspaceSize.restype = c_int32 lib.infiniopGetRandomSampleWorkspaceSize.argtypes = [ infiniopRandomSampleDescriptor_t, POINTER(c_uint64), ] + lib.infiniopRandomSample.restype = c_int32 lib.infiniopRandomSample.argtypes = [ infiniopRandomSampleDescriptor_t, @@ -214,11 +234,13 @@ def lib_random_sample(): c_float, c_void_p, ] + lib.infiniopDestroyRandomSampleDescriptor.restype = c_int32 lib.infiniopDestroyRandomSampleDescriptor.argtypes = [ infiniopRandomSampleDescriptor_t, ] + DEBUG = args.debug PROFILE = args.profile NUM_PRERUN = args.num_prerun NUM_ITERATIONS = args.num_iterations diff --git a/test/infiniop/rearrange.py b/test/infiniop/rearrange.py index 955ee1719..0e54e9f5f 100644 --- a/test/infiniop/rearrange.py +++ b/test/infiniop/rearrange.py @@ -23,13 +23,13 @@ # These are not meant to be imported from other modules _TEST_CASES = [ # ((src_shape, src_stride), (dst_shape, dst_stride)) - (((2, 4, 32), None), ((2, 4, 32), (256, 64, 1))), - (((32, 6, 64), (64, 2560, 1)), ((32, 6, 64), None)), - (((4, 6, 64), (64, 2560, 
1)), ((4, 6, 64), (131072, 64, 1))), - (((1, 32, 64), (2048, 64, 1)), ((1, 32, 64), (2048, 64, 1))), - (((32, 1, 64), (64, 2560, 1)), ((32, 1, 64), (64, 64, 1))), - (((4, 1, 64), (64, 2560, 1)), ((4, 1, 64), (64, 11264, 1))), - (((64,), (1,)), ((64,), (1,))), + (((2, 4, 32), None), ((2, 4, 32), (256, 64, 1))), + (((32, 6, 64), (64, 2560, 1)), ((32, 6, 64), None)), + (((4, 6, 64), (64, 2560, 1)), ((4, 6, 64), (131072, 64, 1))), + (((1, 32, 64), (2048, 64, 1)), ((1, 32, 64), (2048, 64, 1))), + (((32, 1, 64), (64, 2560, 1)), ((32, 1, 64), (64, 64, 1))), + (((4, 1, 64), (64, 2560, 1)), ((4, 1, 64), (64, 11264, 1))), + (((64,), (1,)), ((64,), (1,))), ] # Data types used for testing @@ -37,8 +37,8 @@ # Tolerance map for different data types _TOLERANCE_MAP = { - torch.float16: {"atol": 0, "rtol": 1e-3}, - torch.float32: {"atol": 0, "rtol": 1e-3}, + torch.float16: {"atol": 0, "rtol": 0}, + torch.float32: {"atol": 0, "rtol": 0}, } DEBUG = False @@ -47,7 +47,6 @@ NUM_ITERATIONS = 1000 - class RerrangeDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -68,16 +67,16 @@ def test( print( f"Testing Rerrange on {torch_device} with x_shape:{x_shape} x_stride:{x_stride} y_shape:{y_shape} y_stride:{y_stride} x_dtype:{x_dtype}" ) + x = torch.rand(x_shape, dtype=x_dtype).to(torch_device) y = torch.zeros(y_shape, dtype=x_dtype).to(torch_device) - + x, y = [ rearrange_if_needed(tensor, stride) for tensor, stride in zip([x, y], [x_stride, y_stride]) ] x_tensor, y_tensor = [to_tensor(tensor, lib) for tensor in [x, y]] - descriptor = infiniopRearrangeDescriptor_t() check_error( lib.infiniopCreateRearrangeDescriptor( @@ -91,15 +90,11 @@ def test( def lib_rearrange(): check_error( - lib.infiniopRearrange( - descriptor, - y_tensor.data, - x_tensor.data, - None - ) + lib.infiniopRearrange(descriptor, y_tensor.data, x_tensor.data, None) ) + lib_rearrange() - + # Validate results atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) if DEBUG: @@ -116,8 +111,6 @@ def lib_rearrange(): check_error(lib.infiniopDestroyRearrangeDescriptor(descriptor)) - - if __name__ == "__main__": args = get_args() lib = open_lib() @@ -129,6 +122,7 @@ def lib_rearrange(): infiniopTensorDescriptor_t, infiniopTensorDescriptor_t, ] + lib.infiniopRearrange.restype = c_int32 lib.infiniopRearrange.argtypes = [ infiniopRearrangeDescriptor_t, @@ -136,9 +130,10 @@ def lib_rearrange(): c_void_p, c_void_p, ] + lib.infiniopDestroyRearrangeDescriptor.restype = c_int32 lib.infiniopDestroyRearrangeDescriptor.argtypes = [infiniopRearrangeDescriptor_t] - + # Configure testing options DEBUG = args.debug PROFILE = args.profile diff --git a/test/infiniop/rms_norm.py b/test/infiniop/rms_norm.py index 0adf37241..b60c44922 100644 --- a/test/infiniop/rms_norm.py +++ b/test/infiniop/rms_norm.py @@ -23,21 +23,20 @@ # Configuration (Internal Use Only) # ============================================================================== # These are not meant to be imported from other modules - _TEST_CASES = [ # y_shape, x_shape, w_shape, y_stride, x_stride, w_dtype - ((16, 2048), (16, 2048), (2048,), None, None,torch.float32), + ((16, 2048), (16, 2048), (2048,), None, None, torch.float32), ((16, 2048), (16, 2048), (2048,), None, None, torch.float16), - ((16, 2048), (16, 2048), (2048,), (4096, 1), (4096, 1),torch.float32), + ((16, 2048), (16, 2048), (2048,), (4096, 1), (4096, 1), torch.float32), ((16, 2048), (16, 2048), (2048,), (4096, 1), (4096, 1), torch.float16), ] + # x types used for testing -_TENSOR_DTYPES = [torch.float16, torch.float32] +_TENSOR_DTYPES = 
[torch.float16] # Tolerance map for different data types _TOLERANCE_MAP = { - torch.float16: {"atol": 0, "rtol": 1e-2}, - torch.float32: {"atol": 0, "rtol": 1e-3}, + torch.float16: {"atol": 1e-3, "rtol": 1e-3}, } DEBUG = False @@ -45,12 +44,14 @@ NUM_PRERUN = 10 NUM_ITERATIONS = 1000 + class RMSNormDescriptor(Structure): _fields_ = [("device", c_int32)] infiniopRMSNormDescriptor_t = POINTER(RMSNormDescriptor) + def rms_norm(x, w, eps): input_dtype = x.dtype hidden_states = x.to(torch.float32) @@ -60,18 +61,21 @@ def rms_norm(x, w, eps): def test( - lib, - handle, - torch_device, - y_shape, - x_shape, - w_shape, + lib, + handle, + torch_device, + y_shape, + x_shape, + w_shape, y_stride, x_stride, - dtype=torch.float16, - w_dtype=torch.float16): - print(f"Testing RMS_Norm on {torch_device} with y_shape:{y_shape} x_shape:{x_shape} w_shape:{w_shape}" - f" dtype:{dtype} w_dtype:{w_dtype}") + dtype=torch.float16, + w_dtype=torch.float16, +): + print( + f"Testing RMS_Norm on {torch_device} with y_shape:{y_shape} x_shape:{x_shape} w_shape:{w_shape}" + f" dtype:{dtype} w_dtype:{w_dtype}" + ) y = torch.zeros(y_shape, dtype=dtype).to(torch_device) x = torch.rand(x_shape, dtype=dtype).to(torch_device) @@ -80,18 +84,23 @@ def test( eps = 1e-5 ans = rms_norm(x, w, eps) - x = rearrange_if_needed(x, x_stride) - y = rearrange_if_needed(y, y_stride) + x, y = [ + rearrange_if_needed(tensor, stride) + for tensor, stride in zip([x, y], [x_stride, y_stride]) + ] x_tensor, y_tensor, w_tensor = [to_tensor(tensor, lib) for tensor in [x, y, w]] descriptor = infiniopRMSNormDescriptor_t() - w_dataType = 0 if w_dtype==torch.float16 else 1 check_error( lib.infiniopCreateRMSNormDescriptor( - handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor, - w_tensor.descriptor, eps + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + w_tensor.descriptor, + eps, ) ) @@ -101,11 +110,10 @@ def test( workspace_size = c_uint64(0) check_error( - lib.infiniopGetRMSNormWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) + lib.infiniopGetRMSNormWorkspaceSize(descriptor, ctypes.byref(workspace_size)) ) workspace = create_workspace(workspace_size.value, y.device) + def lib_rms_norm(): check_error( lib.infiniopRMSNorm( @@ -134,12 +142,10 @@ def lib_rms_norm(): check_error(lib.infiniopDestroyRMSNormDescriptor(descriptor)) - - if __name__ == "__main__": - args = get_args() lib = open_lib() + lib.infiniopCreateRMSNormDescriptor.restype = c_int32 lib.infiniopCreateRMSNormDescriptor.argtypes = [ infiniopHandle_t, @@ -166,6 +172,7 @@ def lib_rms_norm(): c_void_p, c_void_p, ] + lib.infiniopDestroyRMSNormDescriptor.restype = c_int32 lib.infiniopDestroyRMSNormDescriptor.argtypes = [ infiniopRMSNormDescriptor_t, @@ -182,5 +189,3 @@ def lib_rms_norm(): test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) print("\033[92mTest passed!\033[0m") - - diff --git a/test/infiniop/rotary_embedding.py b/test/infiniop/rotary_embedding.py index 88fde17d0..64237c67f 100644 --- a/test/infiniop/rotary_embedding.py +++ b/test/infiniop/rotary_embedding.py @@ -15,6 +15,7 @@ debug, get_tolerance, profile_operation, + synchronize_device, ) # ============================================================================== @@ -23,22 +24,21 @@ # These are not meant to be imported from other modules _TEST_CASES = [ # (t_shape, t_strides) - ((1, 32, 128), None), - ((1, 32, 64), None), - # 昇腾暂不满足这个用例,最后一维度 <=32 会有问题,可能与其核心 - # 接口 GatherMask 的内部实现相关,目前 48 64 128 都可以支持 - ((4, 1, 32), None), - ((1, 32, 128), 
None), - ((3, 32, 128), (8000, 200, 1)), + ((1, 32, 128), None), + ((1, 32, 64), None), + # 昇腾暂不满足这个用例,最后一维度 <=32 会有问题,可能与其核心 + # 接口 GatherMask 的内部实现相关,目前 48 64 128 都可以支持 + ((4, 1, 32), None), + ((1, 32, 128), None), + ((3, 32, 128), (8000, 200, 1)), ] # Data types used for testing -_TENSOR_DTYPES = [torch.float16, torch.float32] +_TENSOR_DTYPES = [torch.float16] # Tolerance map for different data types _TOLERANCE_MAP = { - torch.float16: {"atol": 0, "rtol": 1e-2}, - torch.float32: {"atol": 0, "rtol": 1e-3}, + torch.float16: {"atol": 1e-4, "rtol": 1e-2}, } DEBUG = False @@ -47,7 +47,6 @@ NUM_ITERATIONS = 1000 - class RoPEDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -96,14 +95,7 @@ def sin_cos_table(max_seq_len, dim, torch_device, theta): return torch.sin(angles), torch.cos(angles) -def test( - lib, - handle, - torch_device, - shape, - strides=None, - dtype=torch.float16 -): +def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16): print( f"Testing Rotary Positional Embedding on {torch_device} with shape:{shape} strides:{strides} and dtype:{dtype}" ) @@ -126,14 +118,15 @@ def test( # 2x table length for test sin_table, cos_table = sin_cos_table(t.shape[0] * 2, t.shape[2], t.device, theta) - t_tensor, sin_table_tensor, cos_table_tensor = [to_tensor(tensor, lib) for tensor in [t, sin_table, cos_table]] - + t_tensor, sin_table_tensor, cos_table_tensor = [ + to_tensor(tensor, lib) for tensor in [t, sin_table, cos_table] + ] + pos_tensor = to_tensor(pos[: t.shape[0]], lib) pos_tensor.descriptor.contents.dtype = InfiniDtype.U64 - if torch_device == "npu": - torch.npu.synchronize() + synchronize_device(torch_device) check_error( lib.infiniopCreateRoPEDescriptor( @@ -171,11 +164,12 @@ def lib_rope(): ) lib_rope() + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) if DEBUG: debug(t, ans, atol=atol, rtol=rtol) assert torch.allclose(t, ans, atol=atol, rtol=rtol) - + if PROFILE: profile_operation( "PyTorch", @@ -194,6 +188,7 @@ def lib_rope(): if __name__ == "__main__": args = get_args() lib = open_lib() + lib.infiniopCreateRoPEDescriptor.restype = c_int32 lib.infiniopCreateRoPEDescriptor.argtypes = [ infiniopHandle_t, @@ -203,11 +198,13 @@ def lib_rope(): infiniopTensorDescriptor_t, infiniopTensorDescriptor_t, ] + lib.infiniopGetRoPEWorkspaceSize.restype = c_int32 lib.infiniopGetRoPEWorkspaceSize.argtypes = [ infiniopRoPEDescriptor_t, POINTER(c_uint64), ] + lib.infiniopRoPE.restype = c_int32 lib.infiniopRoPE.argtypes = [ infiniopRoPEDescriptor_t, @@ -219,10 +216,12 @@ def lib_rope(): c_void_p, c_void_p, ] + lib.infiniopDestroyRoPEDescriptor.restype = c_int32 lib.infiniopDestroyRoPEDescriptor.argtypes = [ infiniopRoPEDescriptor_t, ] + # Configure testing options DEBUG = args.debug PROFILE = args.profile diff --git a/test/infiniop/swiglu.py b/test/infiniop/swiglu.py index dd5608ab5..fd933f8d9 100644 --- a/test/infiniop/swiglu.py +++ b/test/infiniop/swiglu.py @@ -16,29 +16,64 @@ get_tolerance, profile_operation, ) +from enum import Enum, auto # ============================================================================== # Configuration (Internal Use Only) # ============================================================================== # These are not meant to be imported from other modules _TEST_CASES = [ - # shape, a_stride, b_stride, c_stride - ((13, 4), None, None, None), - ((13, 4), (10, 1), (10, 1), (10, 1)), - ((13, 4, 4), None, None, None), - ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), - ((16, 5632), None, None, None), - ((16, 5632), (13312, 1), (13312, 
1), (13312, 1)),
-    ((4, 4, 5632), None, None, None),
-    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
+    # shape, a_stride, b_stride, c_stride, inplace
+    ((13, 4), None, None, None, Inplace.OUT_OF_PLACE),
+    ((13, 4), None, None, None, Inplace.INPLACE_A),
+    ((13, 4), None, None, None, Inplace.INPLACE_B),
+    ((13, 4), (10, 1), (10, 1), (10, 1), Inplace.OUT_OF_PLACE),
+    ((13, 4), (10, 1), (10, 1), (10, 1), Inplace.INPLACE_A),
+    ((13, 4), (10, 1), (10, 1), (10, 1), Inplace.INPLACE_B),
+    ((13, 4, 4), None, None, None, Inplace.OUT_OF_PLACE),
+    ((13, 4, 4), None, None, None, Inplace.INPLACE_A),
+    ((13, 4, 4), None, None, None, Inplace.INPLACE_B),
+    ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1), Inplace.OUT_OF_PLACE),
+    ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1), Inplace.INPLACE_A),
+    ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1), Inplace.INPLACE_B),
+    ((16, 5632), None, None, None, Inplace.OUT_OF_PLACE),
+    ((16, 5632), None, None, None, Inplace.INPLACE_A),
+    ((16, 5632), None, None, None, Inplace.INPLACE_B),
+    ((16, 5632), (13312, 1), (13312, 1), (13312, 1), Inplace.OUT_OF_PLACE),
+    ((16, 5632), (13312, 1), (13312, 1), (13312, 1), Inplace.INPLACE_A),
+    ((16, 5632), (13312, 1), (13312, 1), (13312, 1), Inplace.INPLACE_B),
+    ((4, 4, 5632), None, None, None, Inplace.OUT_OF_PLACE),
+    ((4, 4, 5632), None, None, None, Inplace.INPLACE_A),
+    ((4, 4, 5632), None, None, None, Inplace.INPLACE_B),
+    (
+        (4, 4, 5632),
+        (45056, 5632, 1),
+        (45056, 5632, 1),
+        (45056, 5632, 1),
+        Inplace.OUT_OF_PLACE,
+    ),
+    (
+        (4, 4, 5632),
+        (45056, 5632, 1),
+        (45056, 5632, 1),
+        (45056, 5632, 1),
+        Inplace.INPLACE_A,
+    ),
+    (
+        (4, 4, 5632),
+        (45056, 5632, 1),
+        (45056, 5632, 1),
+        (45056, 5632, 1),
+        Inplace.INPLACE_B,
+    ),
 ]
+
 # Data types used for testing
-_TENSOR_DTYPES = [torch.float16, torch.float32]
+_TENSOR_DTYPES = [torch.float16]
 
 # Tolerance map for different data types
 _TOLERANCE_MAP = {
-    torch.float16: {'atol': 0, 'rtol': 1e-2},
-    torch.float32: {'atol': 0, 'rtol': 1e-3},
+    torch.float16: {"atol": 1e-4, "rtol": 1e-2},
 }
 
 DEBUG = False
@@ -46,6 +81,13 @@ NUM_PRERUN = 10
 NUM_ITERATIONS = 1000
 
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_A = auto()
+    INPLACE_B = auto()
+
+
 class SwiGLUDescriptor(Structure):
     _fields_ = [("device", c_int32)]
 
@@ -54,11 +96,10 @@ class SwiGLUDescriptor(Structure):
 
 
 def swiglu(a, b):
-
     return a * b / (1 + torch.exp(-b.float()).to(b.dtype))
 
 
-def test_out_of_place(
+def test(
     lib,
     handle,
     torch_device,
@@ -66,15 +107,21 @@ def test_out_of_place(
     a_stride=None,
     b_stride=None,
     c_stride=None,
+    inplace=Inplace.OUT_OF_PLACE,
     dtype=torch.float16,
     sync=None,
 ):
     print(
         f"Testing SwiGLU on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} dtype:{dtype}"
     )
+
     a = torch.rand(shape, dtype=dtype).to(torch_device)
     b = torch.rand(shape, dtype=dtype).to(torch_device)
-    c = torch.rand(shape, dtype=dtype).to(torch_device)
+    c = (
+        torch.rand(shape, dtype=dtype).to(torch_device)
+        if inplace == Inplace.OUT_OF_PLACE
+        else (a if inplace == Inplace.INPLACE_A else b)
+    )
 
     ans = swiglu(a, b)
 
@@ -82,9 +129,12 @@ def test_out_of_place(
         rearrange_if_needed(tensor, stride)
         for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_stride])
     ]
-    a_tensor, b_tensor, c_tensor = [to_tensor(tensor, lib) for tensor in [a, b, c]]
-
-
+    a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
+    c_tensor = (
+        to_tensor(c, lib)
+        if inplace == Inplace.OUT_OF_PLACE
+        else (a_tensor if inplace == Inplace.INPLACE_A else
b_tensor) + ) if sync is not None: sync() @@ -106,13 +156,10 @@ def test_out_of_place( def lib_swiglu(): check_error( lib.infiniopSwiGLU( - descriptor, - c_tensor.data, - a_tensor.data, - b_tensor.data, - None + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None ) ) + lib_swiglu() atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) @@ -130,139 +177,7 @@ def lib_swiglu(): check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor)) -def test_in_place1( - lib, - handle, - torch_device, - shape, - a_stride=None, - b_stride=None, - dtype=torch.float16, - sync=None, -): - a = torch.rand(shape, dtype=dtype).to(torch_device) - b = torch.rand(shape, dtype=dtype).to(torch_device) - - ans = swiglu(a, b) - - if sync is not None: - sync() - - a, b = [ - rearrange_if_needed(tensor, stride) - for tensor, stride in zip([a, b], [a_stride, b_stride]) - ] - a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]] - - descriptor = infiniopSwiGLUDescriptor_t() - - check_error( - lib.infiniopCreateSwiGLUDescriptor( - handle, - ctypes.byref(descriptor), - a_tensor.descriptor, - a_tensor.descriptor, - b_tensor.descriptor, - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [a_tensor, b_tensor]: - tensor.descriptor.contents.invalidate() - def lib_swiglu(): - check_error( - lib.infiniopSwiGLU( - descriptor, a_tensor.data, a_tensor.data, b_tensor.data, None - ) - ) - lib_swiglu() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(a, ans, atol=atol, rtol=rtol) - assert torch.allclose(a, ans, atol=atol, rtol=rtol) - print("in-place1 Test passed!") - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: swiglu(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_swiglu(), torch_device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor)) - - -def test_in_place2( - lib, - handle, - torch_device, - shape, - a_stride=None, - b_stride=None, - dtype=torch.float16, - sync=None, -): - a = torch.rand(shape, dtype=dtype).to(torch_device) - b = torch.rand(shape, dtype=dtype).to(torch_device) - - ans = swiglu(a, b) - - if sync is not None: - sync() - - a, b = [ - rearrange_if_needed(tensor, stride) - for tensor, stride in zip([a, b], [a_stride, b_stride]) - ] - a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]] - - descriptor = infiniopSwiGLUDescriptor_t() - check_error( - lib.infiniopCreateSwiGLUDescriptor( - handle, - ctypes.byref(descriptor), - b_tensor.descriptor, - a_tensor.descriptor, - b_tensor.descriptor, - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [a_tensor, b_tensor]: - tensor.descriptor.contents.invalidate() - - def lib_swiglu(): - check_error( - lib.infiniopSwiGLU( - descriptor, b_tensor.data, a_tensor.data, b_tensor.data, None - ) - ) - lib_swiglu() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(b, ans, atol=atol, rtol=rtol) - assert torch.allclose(b, ans, atol=atol, rtol=rtol) - print("in-place2 Test passed!") - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: swiglu(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_swiglu(), torch_device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor)) - - -def test(lib, handle, 
torch_device, shape, a_stride, b_stride, c_stride, dtype, sync = None): - test_out_of_place( - lib, handle, torch_device, shape, a_stride, b_stride, c_stride, dtype, sync - ) - test_in_place1(lib, handle, torch_device, shape, a_stride, b_stride, dtype, sync) - test_in_place2(lib, handle, torch_device, shape, a_stride, b_stride, dtype, sync) - - - if __name__ == "__main__": - args = get_args() lib = open_lib() @@ -288,12 +203,13 @@ def test(lib, handle, torch_device, shape, a_stride, b_stride, c_stride, dtype, lib.infiniopDestroySwiGLUDescriptor.argtypes = [ infiniopSwiGLUDescriptor_t, ] + # Configure testing options DEBUG = args.debug PROFILE = args.profile NUM_PRERUN = args.num_prerun NUM_ITERATIONS = args.num_iterations - + for device in get_test_devices(args): test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) From 08a29c28bfd926b3142e355fd6016b08ad8d4014 Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Mon, 24 Feb 2025 16:34:13 +0800 Subject: [PATCH 3/5] issue/66: modified random sample test function --- test/infiniop/causal_softmax.py | 1 - test/infiniop/random_sample.py | 102 +++++++++++++++----------------- 2 files changed, 49 insertions(+), 54 deletions(-) diff --git a/test/infiniop/causal_softmax.py b/test/infiniop/causal_softmax.py index 9f6385d9f..fd3f63957 100644 --- a/test/infiniop/causal_softmax.py +++ b/test/infiniop/causal_softmax.py @@ -21,7 +21,6 @@ # Configuration (Internal Use Only) # ============================================================================== # These are not meant to be imported from other modules - _TEST_CASES = [ # x_shape, x_stride ((32, 512), None), diff --git a/test/infiniop/random_sample.py b/test/infiniop/random_sample.py index c5741e245..fc0694184 100644 --- a/test/infiniop/random_sample.py +++ b/test/infiniop/random_sample.py @@ -22,7 +22,6 @@ # Configuration (Internal Use Only) # ============================================================================== # These are not meant to be imported from other modules - _TEST_CASES = [ # voc, random_val, topp, topk, temperature (512, 0.8, 0.8, 3, 0.5), @@ -59,53 +58,52 @@ class RandomSampleDescriptor(Structure): def random_sample(data, random_val, topp, topk, voc, temperature, torch_device): - indices = torch.zeros([topk], dtype=torch.int64) - dataNp = data.clone().detach() - sorted_indices = torch.arange(voc) - - for i in range(topk): - for j in range(i + 1, voc): - if dataNp[i] < dataNp[j]: - tmp = dataNp[i].clone().detach() - dataNp[i] = dataNp[j].clone().detach() - dataNp[j] = tmp - - tmpInd = sorted_indices[i].clone().detach() - sorted_indices[i] = sorted_indices[j].clone().detach() - sorted_indices[j] = tmpInd - - # sorted_indices = torch.argsort(dataNp, descending=True) - indices = sorted_indices[:topk] - - dataNp = dataNp[sorted_indices] - - globalM = dataNp[0] - dataNp = (dataNp - globalM) / temperature - dataNp = torch.softmax(dataNp.float(), dim=0) - sum_s = 0 - for end in range(topk): - sum_s += dataNp[end] - if sum_s >= topp: - break - if end < topk - 1: - end += 1 + if topp > 0 and topk > 1: + indices = torch.zeros([topk], dtype=torch.int64) + dataNp = data.clone().detach() + sorted_indices = torch.arange(voc) + + for i in range(topk): + for j in range(i + 1, voc): + if dataNp[i] < dataNp[j]: + tmp = dataNp[i].clone().detach() + dataNp[i] = dataNp[j].clone().detach() + dataNp[j] = tmp + + tmpInd = sorted_indices[i].clone().detach() + sorted_indices[i] = sorted_indices[j].clone().detach() + sorted_indices[j] = tmpInd + + # sorted_indices = torch.argsort(dataNp, 
descending=True) + indices = sorted_indices[:topk] + + dataNp = dataNp[sorted_indices] + + globalM = dataNp[0] + dataNp = (dataNp - globalM) / temperature + dataNp = torch.softmax(dataNp.float(), dim=0) + sum_s = 0 + for end in range(topk): + sum_s += dataNp[end] + if sum_s >= topp: + break + if end < topk - 1: + end += 1 + else: + end = topk + + sum_s = 0 + for i in range(end): + sum_s += dataNp[i] + random_val *= sum_s + + sum_s = 0 + for i in range(end): + sum_s += dataNp[i] + if random_val < sum_s: + return indices[i] else: - end = topk - - sum_s = 0 - for i in range(end): - sum_s += dataNp[i] - random_val *= sum_s - - sum_s = 0 - for i in range(end): - sum_s += dataNp[i] - if random_val < sum_s: - return indices[i] - - -def random_sample_0(data): - return torch.argmax(data) + return torch.argmax(data) def test( @@ -124,12 +122,10 @@ def test( data = torch.arange(voc).float() * 0.0001 _perm = torch.randperm(voc) data = data[_perm].to(x_dtype).to(torch_device) - if topp > 0 and topk > 1: - ans = random_sample( - data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu" - ) - else: - ans = random_sample_0(data) + + ans = random_sample( + data, random_val, topp, topk, voc, temperature, torch_device + ) # 这个函数在device速度可能会很慢,可以通过data.to("cpu")方式加快计算过程 indices = torch.zeros([1], dtype=torch.int64).to(torch_device) From c0811ed4c6b112fa08fa9567b7e22ab01c7721c9 Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Tue, 25 Feb 2025 13:28:13 +0800 Subject: [PATCH 4/5] issue/66: modified random_sample, swiglu, rms_norm, test --- test/infiniop/random_sample.py | 8 ++--- test/infiniop/rms_norm.py | 1 + test/infiniop/swiglu.py | 66 ++++++++++++---------------------- 3 files changed, 25 insertions(+), 50 deletions(-) diff --git a/test/infiniop/random_sample.py b/test/infiniop/random_sample.py index fc0694184..9584d4925 100644 --- a/test/infiniop/random_sample.py +++ b/test/infiniop/random_sample.py @@ -188,13 +188,9 @@ def lib_random_sample(): # Profiling workflow if PROFILE: # fmt: off - if topp > 0 and topk > 1: - profile_operation("PyTorch", lambda: random_sample( - data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu" + profile_operation("PyTorch", lambda: random_sample( + data, random_val, topp, topk, voc, temperature, torch_device ), torch_device, NUM_PRERUN, NUM_ITERATIONS) - else: - profile_operation("PyTorch", lambda: random_sample_0(data), torch_device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_random_sample(), torch_device, NUM_PRERUN, NUM_ITERATIONS) # fmt: on check_error(lib.infiniopDestroyRandomSampleDescriptor(descriptor)) diff --git a/test/infiniop/rms_norm.py b/test/infiniop/rms_norm.py index b60c44922..eaab61fd0 100644 --- a/test/infiniop/rms_norm.py +++ b/test/infiniop/rms_norm.py @@ -133,6 +133,7 @@ def lib_rms_norm(): if DEBUG: debug(y, ans, atol=atol, rtol=rtol) assert torch.allclose(y, ans, atol=atol, rtol=rtol) + # Profiling workflow if PROFILE: # fmt: off diff --git a/test/infiniop/swiglu.py b/test/infiniop/swiglu.py index fd933f8d9..427db4d82 100644 --- a/test/infiniop/swiglu.py +++ b/test/infiniop/swiglu.py @@ -22,50 +22,29 @@ # Configuration (Internal Use Only) # ============================================================================== # These are not meant to be imported from other modules +_TEST_CASES_ = [ + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 
1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + "Inplace.OUT_OF_PLACE", + "Inplace.INPLACE_A", + "Inplace.INPLACE_B", +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ _TEST_CASES = [ - # shape, a_stride, b_stride, c_stride, inplace - ((13, 4), None, None, None, Inplace.OUT_OF_PLACE), - ((13, 4), None, None, None, Inplace.INPLACE_A), - ((13, 4), None, None, None, Inplace.INPLACE_B), - ((13, 4), (10, 1), (10, 1), (10, 1), Inplace.OUT_OF_PLACE), - ((13, 4), (10, 1), (10, 1), (10, 1), Inplace.INPLACE_A), - ((13, 4), (10, 1), (10, 1), (10, 1), Inplace.INPLACE_B), - ((13, 4, 4), None, None, None, Inplace.OUT_OF_PLACE), - ((13, 4, 4), None, None, None, Inplace.INPLACE_A), - ((13, 4, 4), None, None, None, Inplace.INPLACE_B), - ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1), Inplace.OUT_OF_PLACE), - ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1), Inplace.INPLACE_A), - ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1), Inplace.INPLACE_B), - ((16, 5632), None, None, None, Inplace.OUT_OF_PLACE), - ((16, 5632), None, None, None, Inplace.INPLACE_A), - ((16, 5632), None, None, None, Inplace.INPLACE_B), - ((16, 5632), (13312, 1), (13312, 1), (13312, 1), Inplace.OUT_OF_PLACE), - ((16, 5632), (13312, 1), (13312, 1), (13312, 1), Inplace.INPLACE_A), - ((16, 5632), (13312, 1), (13312, 1), (13312, 1), Inplace.INPLACE_B), - ((4, 4, 5632), None, None, None, Inplace.OUT_OF_PLACE), - ((4, 4, 5632), None, None, None, Inplace.INPLACE_A), - ((4, 4, 5632), None, None, None, Inplace.INPLACE_B), - ( - (4, 4, 5632), - (45056, 5632, 1), - (45056, 5632, 1), - (45056, 5632, 1), - Inplace.OUT_OF_PLACE, - ), - ( - (4, 4, 5632), - (45056, 5632, 1), - (45056, 5632, 1), - (45056, 5632, 1), - Inplace.INPLACE_A, - ), - ( - (4, 4, 5632), - (45056, 5632, 1), - (45056, 5632, 1), - (45056, 5632, 1), - Inplace.INPLACE_B, - ), + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE ] # Data types used for testing @@ -166,7 +145,6 @@ def lib_swiglu(): if DEBUG: debug(c, ans, atol=atol, rtol=rtol) assert torch.allclose(c, ans, atol=atol, rtol=rtol) - print("out-of-place Test passed!") # Profiling workflow if PROFILE: From 642e8de0a9ea3e9c9180e8a1905dc34f6b748ad3 Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Tue, 25 Feb 2025 14:09:01 +0800 Subject: [PATCH 5/5] issue/66: add lib_random_sample() --- test/infiniop/random_sample.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/infiniop/random_sample.py b/test/infiniop/random_sample.py index 9584d4925..8ca1fdf97 100644 --- a/test/infiniop/random_sample.py +++ b/test/infiniop/random_sample.py @@ -171,6 +171,8 @@ def lib_random_sample(): ) ) + lib_random_sample() + if torch_device == "npu": synchronize_device(torch_device)
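
The swiglu.py changes above collapse the separate out-of-place and in-place test functions into a single test() driven by an in-place mode appended to each test case. Below is a minimal, self-contained sketch of that parametrization pattern, not taken verbatim from the patches: it assumes the Inplace enum members (rather than strings) are stored in _INPLACE so the comparisons inside test() match, and it uses a shortened case list for illustration.

from enum import Enum, auto


class Inplace(Enum):
    OUT_OF_PLACE = auto()
    INPLACE_A = auto()
    INPLACE_B = auto()


# Base cases carry only shape and strides; the in-place mode is appended below.
_TEST_CASES_ = [
    ((13, 4), None, None, None),
    ((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
]

# Enum members are stored directly so that `inplace == Inplace.INPLACE_A`
# style checks inside test() evaluate as intended.
_INPLACE = [Inplace.OUT_OF_PLACE, Inplace.INPLACE_A, Inplace.INPLACE_B]

# Cross product: every (shape, a_stride, b_stride, c_stride) tuple gains one
# in-place mode, so 2 base cases expand into 6 full cases and a single test()
# covers what the removed test_in_place1/test_in_place2 helpers used to do.
_TEST_CASES = [
    case + (mode,) for case in _TEST_CASES_ for mode in _INPLACE
]

# Inside test(), the output tensor is then derived from the mode:
#   Inplace.INPLACE_A     -> c aliases a
#   Inplace.INPLACE_B     -> c aliases b
#   Inplace.OUT_OF_PLACE  -> c is a freshly allocated tensor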