From 3195bf00e08c343205058a5079c5ab8bc6841035 Mon Sep 17 00:00:00 2001 From: Zimin Li Date: Thu, 16 Jan 2025 17:21:42 +0800 Subject: [PATCH] Refactored the tests with profiling --- operatorspy/tests/add.py | 70 ++++-- operatorspy/tests/attention.py | 190 ++++++++++------ operatorspy/tests/avg_pool.py | 100 ++++---- operatorspy/tests/causal_softmax.py | 81 +++++-- operatorspy/tests/conv.py | 104 +++++---- operatorspy/tests/expand.py | 81 ++++--- operatorspy/tests/gemm.py | 103 +++++---- operatorspy/tests/global_avg_pool.py | 94 ++++---- operatorspy/tests/max_pool.py | 100 ++++---- operatorspy/tests/mlp.py | 175 +++++++++----- operatorspy/tests/random_sample.py | 97 ++++++-- operatorspy/tests/rearrange.py | 80 +++++-- operatorspy/tests/relu.py | 78 ++++--- operatorspy/tests/rms_norm.py | 107 +++++++-- operatorspy/tests/rotary_embedding.py | 117 ++++++++-- operatorspy/tests/swiglu.py | 315 ++++++++++++++------------ 16 files changed, 1242 insertions(+), 650 deletions(-) diff --git a/operatorspy/tests/add.py b/operatorspy/tests/add.py index 455014cc..cce981bf 100644 --- a/operatorspy/tests/add.py +++ b/operatorspy/tests/add.py @@ -2,6 +2,7 @@ import ctypes import sys import os +import time sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) from operatorspy import ( @@ -15,10 +16,13 @@ check_error, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device from enum import Enum, auto import torch +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 class Inplace(Enum): OUT_OF_PLACE = auto() @@ -83,36 +87,65 @@ def test( check_error( lib.infiniopAdd(descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None) ) + assert torch.allclose(c, ans, atol=0, rtol=1e-3) + + if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + _ = add(a, b) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = add(a, b) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): + check_error( + lib.infiniopAdd(descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None) + ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopAdd(descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None) + ) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed * 1000 :6f} ms") + check_error(lib.infiniopDestroyAddDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) for c_shape, a_shape, b_shape, inplace in test_cases: - test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) - test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=tensor_dtype, inplace=inplace) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) for c_shape, a_shape, b_shape, inplace in test_cases: - test(lib, handle, "cuda", c_shape, a_shape, b_shape, 
tensor_dtype=torch.float16, inplace=inplace) - test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=tensor_dtype, inplace=inplace) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG handle = create_handle(lib, device) for c_shape, a_shape, b_shape, inplace in test_cases: - test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) - test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=tensor_dtype, inplace=inplace) destroy_handle(lib, handle) @@ -126,14 +159,17 @@ def test_bang(lib, test_cases): ((), (), (), Inplace.OUT_OF_PLACE), ((3, 3), (3, 3), (3, 3), Inplace.OUT_OF_PLACE), ((2, 20, 3), (2, 1, 3), (2, 20, 3), Inplace.OUT_OF_PLACE), - ((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_A), - ((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_B), + ((3, 20, 512), (3, 20, 512), (3, 20, 512), Inplace.INPLACE_A), + ((3, 20, 512), (3, 20, 512), (3, 20, 512), Inplace.INPLACE_B), ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), ] + tensor_dtypes = [ + torch.float16, torch.float32, + ] args = get_args() lib = open_lib() lib.infiniopCreateAddDescriptor.restype = c_int32 @@ -157,12 +193,14 @@ def test_bang(lib, test_cases): infiniopAddDescriptor_t, ] + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if not (args.cpu or args.cuda or args.bang): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/attention.py b/operatorspy/tests/attention.py index f5449aaa..2225bc6d 100644 --- a/operatorspy/tests/attention.py +++ b/operatorspy/tests/attention.py @@ -2,6 +2,7 @@ import ctypes import sys import os +import time sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) from operatorspy import ( @@ -18,10 +19,13 @@ create_workspace, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device import torch import torch.nn.functional as F +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 class AttentionDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -186,10 +190,59 @@ def test( assert torch.allclose(out, ans, atol=1e-4, rtol=1e-2) + if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + _ = attention(q, k, v, k_cache, v_cache, pos) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = attention(q, k, v, k_cache, v_cache, pos) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 
1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): + check_error( + lib.infiniopAttention( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + out_tensor.data, + q_tensor.data, + k_tensor.data, + v_tensor.data, + k_cache_tensor.data, + v_cache_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopAttention( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + out_tensor.data, + q_tensor.data, + k_tensor.data, + v_tensor.data, + k_cache_tensor.data, + v_cache_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed * 1000 :6f} ms") + check_error(lib.infiniopDestroyAttentionDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) @@ -201,36 +254,36 @@ def test_cpu(lib, test_cases): pos, k_cache_buf_len, v_cache_buf_len, - dtype, q_stride, k_stride, v_stride, k_cache_stride, v_cache_stride, ) in test_cases: - test( - lib, - handle, - "cpu", - n_q_head, - n_kv_head, - seq_len, - head_dim, - pos, - k_cache_buf_len, - v_cache_buf_len, - dtype, - q_stride, - k_stride, - v_stride, - k_cache_stride, - v_cache_stride, - ) + for tensor_dtype in tensor_dtypes: + test( + lib, + handle, + "cpu", + n_q_head, + n_kv_head, + seq_len, + head_dim, + pos, + k_cache_buf_len, + v_cache_buf_len, + tensor_dtype, + q_stride, + k_stride, + v_stride, + k_cache_stride, + v_cache_stride, + ) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) @@ -242,36 +295,36 @@ def test_cuda(lib, test_cases): pos, k_cache_buf_len, v_cache_buf_len, - dtype, q_stride, k_stride, v_stride, k_cache_stride, v_cache_stride, ) in test_cases: - test( - lib, - handle, - "cuda", - n_q_head, - n_kv_head, - seq_len, - head_dim, - pos, - k_cache_buf_len, - v_cache_buf_len, - dtype, - q_stride, - k_stride, - v_stride, - k_cache_stride, - v_cache_stride, - ) + for tensor_dtype in tensor_dtypes: + test( + lib, + handle, + "cuda", + n_q_head, + n_kv_head, + seq_len, + head_dim, + pos, + k_cache_buf_len, + v_cache_buf_len, + tensor_dtype, + q_stride, + k_stride, + v_stride, + k_cache_stride, + v_cache_stride, + ) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG @@ -292,24 +345,25 @@ def test_bang(lib, test_cases): k_cache_stride, v_cache_stride, ) in test_cases: - test( - lib, - handle, - "mlu", - n_q_head, - n_kv_head, - seq_len, - head_dim, - pos, - k_cache_buf_len, - v_cache_buf_len, - dtype, - q_stride, - k_stride, - v_stride, - k_cache_stride, - v_cache_stride, - ) + for tensor_dtype in tensor_dtypes: + test( + lib, + handle, + "mlu", + n_q_head, + n_kv_head, + seq_len, + head_dim, + pos, + k_cache_buf_len, + v_cache_buf_len, + dtype, + q_stride, + k_stride, + v_stride, + k_cache_stride, + v_cache_stride, + ) destroy_handle(lib, handle) @@ -325,7 +379,6 @@ def test_bang(lib, test_cases): 0, # pos 2048, # k_cache_buf_len 2048, # v_cache_buf_len - torch.float16, # dtype [64, 2560, 1], # q_stride [64, 2560, 1], # k_stride [64, 2560, 1], # v_stride @@ -341,7 +394,6 @@ def test_bang(lib, 
test_cases): 3, # pos 2048, # k_cache_buf_len 2048, # v_cache_buf_len - torch.float16, # dtype [64, 2560, 1], # q_stride [64, 2560, 1], # k_stride [64, 2560, 1], # v_stride @@ -357,7 +409,6 @@ def test_bang(lib, test_cases): 1, # pos 8, # k_cache_buf_len 8, # v_cache_buf_len - torch.float16, # dtype None, # q_stride None, # k_stride None, # v_stride @@ -365,6 +416,9 @@ def test_bang(lib, test_cases): None, # v_cache_stride ), ] + tensor_dtypes = [ + torch.float16, + ] args = get_args() lib = open_lib() @@ -406,12 +460,14 @@ def test_bang(lib, test_cases): infiniopAttentionDescriptor_t, ] + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if not (args.cpu or args.cuda or args.bang): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/avg_pool.py b/operatorspy/tests/avg_pool.py index 9c240789..f0530327 100644 --- a/operatorspy/tests/avg_pool.py +++ b/operatorspy/tests/avg_pool.py @@ -16,13 +16,10 @@ check_error, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device import torch from typing import Tuple -# constant for control whether profile the pytorch and lib functions -# NOTE: need to manually add synchronization function to the lib function, -# e.g., cudaDeviceSynchronize() for CUDA PROFILE = False NUM_PRERUN = 10 NUM_ITERATIONS = 1000 @@ -51,8 +48,6 @@ def pool(x, k, padding, stride, dilation = 1): ans = pooling_layers[ndim](k, stride=stride, padding=padding)(x.to(torch.float32)).to(torch.float16) else: ans = pooling_layers[ndim](k, stride=stride, padding=padding)(x) - if PROFILE: - torch.cuda.synchronize() return ans @@ -92,14 +87,7 @@ def test( x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device) y = torch.rand(inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype).to(torch_device) - for i in range(NUM_PRERUN if PROFILE else 1): - ans = pool(x, k_shape, padding, strides) - if PROFILE: - start_time = time.time() - for i in range(NUM_ITERATIONS): - _ = pool(x, k_shape, padding, strides) - elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f"pytorch time: {elapsed :6f}") + ans = pool(x, k_shape, padding, strides) x_tensor = to_tensor(x, lib) y_tensor = to_tensor(y, lib) @@ -129,18 +117,47 @@ def test( workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) - for i in range(NUM_PRERUN if PROFILE else 1): - check_error( - lib.infiniopAvgPool( - descriptor, - workspace_ptr, - workspaceSize, - y_tensor.data, - x_tensor.data, - None, - ) + check_error( + lib.infiniopAvgPool( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + None, ) + ) + + if (tensor_dtype == torch.float16): + assert torch.allclose(y, ans, atol=0, rtol=1e-3) + else: + assert torch.allclose(y, ans, atol=1e-6, rtol=1e-5) + if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + _ = pool(x, k_shape, padding, strides) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = pool(x, k_shape, padding, strides) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / 
NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): + check_error( + lib.infiniopAvgPool( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + None, + ) + ) + synchronize_device(torch_device) start_time = time.time() for i in range(NUM_ITERATIONS): check_error( @@ -153,39 +170,39 @@ def test( None, ) ) + synchronize_device(torch_device) elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f" lib time: {elapsed :6f}") + print(f" lib time: {elapsed * 1000 :6f} ms") - assert torch.allclose(y, ans, atol=0, rtol=1e-3) check_error(lib.infiniopDestroyAvgPoolDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) for x_shape, kernel_shape, padding, strides in test_cases: - test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) - test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) for x_shape, kernel_shape, padding, strides in test_cases: - test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) - test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG handle = create_handle(lib, device) for x_shape, kernel_shape, padding, strides in test_cases: - test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) - test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) @@ -196,6 +213,9 @@ def test_bang(lib, test_cases): ((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)), ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)), ] + tensor_dtypes = [ + torch.float16, torch.float32, + ] args = get_args() lib = open_lib() lib.infiniopCreateAvgPoolDescriptor.restype = c_int32 @@ -228,12 +248,14 @@ def test_bang(lib, test_cases): infiniopAvgPoolDescriptor_t, ] + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if not (args.cpu or args.cuda or args.bang): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/causal_softmax.py b/operatorspy/tests/causal_softmax.py index 1ad304b2..726bd5ca 100644 --- a/operatorspy/tests/causal_softmax.py +++ b/operatorspy/tests/causal_softmax.py @@ -2,6 +2,7 @@ import ctypes import sys import os +import time sys.path.insert(0, 
os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) @@ -18,9 +19,12 @@ create_workspace, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device import torch +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 class CausalSoftmaxDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -73,41 +77,85 @@ def test(lib, handle, torch_device, x_shape, x_stride=None, x_dtype=torch.float1 ) ) assert torch.allclose(x, ans, atol=0, rtol=1e-2) + + if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + _ = causal_softmax(x) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = causal_softmax(x) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): + check_error( + lib.infiniopCausalSoftmax( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + x_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopCausalSoftmax( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + x_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed * 1000 :6f} ms") + check_error(lib.infiniopDestroyCausalSoftmaxDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) for x_shape, x_stride in test_cases: - test(lib, handle, "cpu", x_shape, x_stride) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cpu", x_shape, x_stride, x_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) for x_shape, x_stride in test_cases: - test(lib, handle, "cuda", x_shape, x_stride) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cuda", x_shape, x_stride, x_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG handle = create_handle(lib, device) for x_shape, x_stride in test_cases: - test(lib, handle, "mlu", x_shape, x_stride) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "mlu", x_shape, x_stride, x_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_ascend(lib, test_cases): +def test_ascend(lib, test_cases, tensor_dtypes): import torch_npu device = DeviceEnum.DEVICE_ASCEND handle = create_handle(lib, device) for x_shape, x_stride in test_cases: - test(lib, handle, "npu", x_shape, x_stride) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "npu", x_shape, x_stride, x_dtype=tensor_dtype) destroy_handle(lib, handle) @@ -117,6 +165,9 @@ def test_ascend(lib, test_cases): ((32, 20, 512), None), ((32, 20, 512), (20480, 512, 1)), # Ascend 暂不支持非连续 ] + tensor_dtypes = [ + torch.float16, + ] args = get_args() lib = open_lib() lib.infiniopCreateCausalSoftmaxDescriptor.restype = c_int32 @@ -143,14 +194,16 @@ def test_ascend(lib, test_cases): infiniopCausalSoftmaxDescriptor_t, ] + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, 
test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if args.ascend: - test_ascend(lib, test_cases) + test_ascend(lib, test_cases, tensor_dtypes) if not (args.cpu or args.cuda or args.bang or args.ascend): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/conv.py b/operatorspy/tests/conv.py index 7e7ea953..eac7869a 100644 --- a/operatorspy/tests/conv.py +++ b/operatorspy/tests/conv.py @@ -16,16 +16,13 @@ check_error, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device import torch import math import ctypes from torch.nn import functional as F from typing import List, Tuple -# constant for control whether profile the pytorch and lib functions -# NOTE: need to manually add synchronization function to the lib function, -# e.g., cudaDeviceSynchronize() for CUDA PROFILE = False NUM_PRERUN = 10 NUM_ITERATIONS = 1000 @@ -108,14 +105,7 @@ def test( inferShape(x.shape, w.shape, pads, strides, dilations), dtype=tensor_dtype ).to(torch_device) - for i in range(NUM_PRERUN if PROFILE else 1): - ans = conv(x, w, strides, pads, dilations) - if PROFILE: - start_time = time.time() - for i in range(NUM_ITERATIONS): - _ = conv(x, w, strides, pads, dilations) - elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f"pytorch time: {elapsed :6f}") + ans = conv(x, w, strides, pads, dilations) x_tensor = to_tensor(x, lib) w_tensor = to_tensor(w, lib) @@ -148,21 +138,37 @@ def test( workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) - for i in range(NUM_PRERUN if PROFILE else 1): - check_error( - lib.infiniopConv( - descriptor, - workspace_ptr, - workspaceSize, - y_tensor.data, - x_tensor.data, - w_tensor.data, - None, - ) + check_error( + lib.infiniopConv( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + w_tensor.data, + None, ) + ) + + if (tensor_dtype == torch.float16): + assert torch.allclose(y, ans, atol=0, rtol=1e-2) + else: + assert torch.allclose(y, ans, atol=0, rtol=1e-3) + if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + _ = conv(x, w, strides, pads, dilations) + synchronize_device(torch_device) start_time = time.time() for i in range(NUM_ITERATIONS): + _ = conv(x, w, strides, pads, dilations) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): check_error( lib.infiniopConv( descriptor, @@ -174,42 +180,53 @@ def test( None, ) ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopConv( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + w_tensor.data, + None, + ) + ) + synchronize_device(torch_device) elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f" lib time: {elapsed :6f}") + print(f" lib time: {elapsed * 1000 :6f} ms") - if (tensor_dtype == torch.float16): - assert torch.allclose(y, ans, atol=0, rtol=1e-2) - else: - assert torch.allclose(y, ans, atol=0, rtol=1e-3) 
check_error(lib.infiniopDestroyConvDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases: - test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16) - test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases: - test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16) - test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG handle = create_handle(lib, device) for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases: - test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16) - test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) @@ -257,6 +274,9 @@ def test_bang(lib, test_cases): None, ), ] + tensor_dtypes = [ + torch.float16, torch.float32, + ] args = get_args() lib = open_lib() lib.infiniopCreateConvDescriptor.restype = c_int32 @@ -286,12 +306,14 @@ def test_bang(lib, test_cases): infiniopConvDescriptor_t, ] + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if not (args.cpu or args.cuda or args.bang): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/expand.py b/operatorspy/tests/expand.py index e060ad73..fc5535b5 100644 --- a/operatorspy/tests/expand.py +++ b/operatorspy/tests/expand.py @@ -17,12 +17,9 @@ rearrange_tensor, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device import torch -# constant for control whether profile the pytorch and lib functions -# NOTE: need to manually add synchronization function to the lib function, -# e.g., cudaDeviceSynchronize() for CUDA PROFILE = False NUM_PRERUN = 10 NUM_ITERATIONS = 1000 @@ -36,11 +33,7 @@ class ExpandDescriptor(Structure): def expand(x, y): - if PROFILE: - ans = x.expand_as(y).clone() - torch.cuda.synchronize() - return ans - return x.expand_as(y) + return x.expand_as(y).clone() def test( @@ -65,14 +58,7 @@ def test( if y_stride is not None: y = rearrange_tensor(y, 
y_stride) - for i in range(NUM_PRERUN if PROFILE else 1): - ans = expand(x, y) - if PROFILE: - start_time = time.time() - for i in range(NUM_ITERATIONS): - _ = expand(x, y) - elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f"pytorch time: {elapsed :6f}") + ans = expand(x, y) x_tensor = to_tensor(x, lib) y_tensor = to_tensor(y, lib) @@ -91,46 +77,62 @@ def test( x_tensor.descriptor.contents.invalidate() y_tensor.descriptor.contents.invalidate() - for i in range(NUM_PRERUN if PROFILE else 1): - check_error(lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None)) + check_error(lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None)) + + assert torch.allclose(y, ans, atol=0, rtol=0) + if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + _ = expand(x, y) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = expand(x, y) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): + check_error(lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None)) + synchronize_device(torch_device) start_time = time.time() for i in range(NUM_ITERATIONS): - check_error( - lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None) - ) + check_error(lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None)) + synchronize_device(torch_device) elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f" lib time: {elapsed :6f}") - assert torch.allclose(y, ans, atol=0, rtol=1e-3) + print(f" lib time: {elapsed * 1000 :6f} ms") + check_error(lib.infiniopDestroyExpandDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) for y_shape, x_shape, y_stride, x_stride in test_cases: - test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16) - test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) for y_shape, x_shape, y_stride, x_stride in test_cases: - test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16) - test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG handle = create_handle(lib, device) for y_shape, x_shape, y_stride, x_stride in test_cases: - test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16) - test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) @@ -147,6 +149,9 @@ def test_bang(lib, test_cases): ((3, 2, 4, 5), (3, 2, 1, 1), None, None), ((32, 256, 
112, 112), (32, 256, 112, 1), None, None), ] + tensor_dtypes = [ + torch.float16, torch.float32, + ] args = get_args() lib = open_lib() lib.infiniopCreateExpandDescriptor.restype = c_int32 @@ -168,12 +173,14 @@ def test_bang(lib, test_cases): infiniopExpandDescriptor_t, ] + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if not (args.cpu or args.cuda or args.bang): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/gemm.py b/operatorspy/tests/gemm.py index 5da99eac..bee41b17 100644 --- a/operatorspy/tests/gemm.py +++ b/operatorspy/tests/gemm.py @@ -17,12 +17,9 @@ rearrange_tensor, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device import torch -# constant for control whether profile the pytorch and lib functions -# NOTE: need to manually add synchronization function to the lib function, -# e.g., cudaDeviceSynchronize() for CUDA PROFILE = False NUM_PRERUN = 10 NUM_ITERATIONS = 1000 @@ -40,8 +37,6 @@ def gemm(A, B, C=None, transA=False, transB=False, alpha=1.0, beta=0.0, dtype=to result = alpha * torch.matmul(A if dtype != torch.float16 else A.to(torch.float32), B if dtype != torch.float16 else B.to(torch.float32)).to(dtype) if C is not None: result += beta * C if dtype != torch.float16 else C.to(torch.float32) - if PROFILE: - torch.cuda.synchronize() return result @@ -83,14 +78,7 @@ def test( if y_stride is not None: y = rearrange_tensor(y, y_stride) - for i in range(NUM_PRERUN if PROFILE else 1): - ans = gemm(a, b, c, transA, transB, alpha, beta, dtype) - if PROFILE: - start_time = time.time() - for i in range(NUM_ITERATIONS): - _ = gemm(a, b, c, transA, transB, alpha, beta, dtype) - elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f"pytorch time: {elapsed :6f}") + ans = gemm(a, b, c, transA, transB, alpha, beta, dtype) a_tensor = to_tensor(a, lib) b_tensor = to_tensor(b, lib) @@ -130,22 +118,35 @@ def test( ) workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) - for i in range(NUM_PRERUN if PROFILE else 1): - check_error( - lib.infiniopGEMM( - descriptor, - workspace_ptr, - workspace_size, - y_tensor.data, - a_tensor.data, - b_tensor.data, - c_tensor.data if c_tensor else None, - None, - ) + check_error( + lib.infiniopGEMM( + descriptor, + workspace_ptr, + workspace_size, + y_tensor.data, + a_tensor.data, + b_tensor.data, + c_tensor.data if c_tensor else None, + None, ) + ) + + assert torch.allclose(y, ans, atol=0, rtol=1e-2) + if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + _ = gemm(a, b, c, transA, transB, alpha, beta, dtype) + synchronize_device(torch_device) start_time = time.time() for i in range(NUM_ITERATIONS): + _ = gemm(a, b, c, transA, transB, alpha, beta, dtype) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): check_error( lib.infiniopGEMM( descriptor, @@ -158,14 +159,29 @@ def test( None, ) ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopGEMM( + descriptor, + 
workspace_ptr, + workspace_size, + y_tensor.data, + a_tensor.data, + b_tensor.data, + c_tensor.data if c_tensor else None, + None, + ) + ) + synchronize_device(torch_device) elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f" lib time: {elapsed :6f}") + print(f" lib time: {elapsed * 1000 :6f} ms") - assert torch.allclose(y, ans, atol=0, rtol=1e-2) check_error(lib.infiniopDestroyGEMMDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) for ( @@ -182,12 +198,12 @@ def test_cpu(lib, test_cases): c_stride, y_stride, ) in test_cases: - test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16) - test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=tensor_dtype) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) for ( @@ -204,12 +220,12 @@ def test_cuda(lib, test_cases): c_stride, y_stride, ) in test_cases: - test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16) - test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=tensor_dtype) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG @@ -229,8 +245,8 @@ def test_bang(lib, test_cases): c_stride, y_stride, ) in test_cases: - test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16) - test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=tensor_dtype) destroy_handle(lib, handle) @@ -323,6 +339,9 @@ def test_bang(lib, test_cases): (4096, 1), ), ] + tensor_dtypes = [ + torch.float16, torch.float32, + ] args = get_args() lib = open_lib() @@ -363,12 +382,14 @@ def test_bang(lib, test_cases): infiniopGEMMDescriptor_t, ] + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if not (args.cpu or args.cuda or args.bang): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/global_avg_pool.py b/operatorspy/tests/global_avg_pool.py index 33f7b64d..b1907162 100644 --- a/operatorspy/tests/global_avg_pool.py 
+++ b/operatorspy/tests/global_avg_pool.py @@ -16,12 +16,9 @@ check_error, ) -from operatorspy.tests.test_utils import get_args -import torch, time +from operatorspy.tests.test_utils import get_args, synchronize_device +import torch -# constant for control whether profile the pytorch and lib functions -# NOTE: need to manually add synchronization function to the lib function, -# e.g., cudaDeviceSynchronize() for CUDA PROFILE = False NUM_PRERUN = 10 NUM_ITERATIONS = 1000 @@ -40,8 +37,6 @@ def inferShape(x): def globalAvgPool(x): y = torch.mean(x, dim=tuple(range(2, x.dim())), keepdim=True) - if PROFILE: - torch.cuda.synchronize() return y.view(*inferShape(x)) @@ -59,14 +54,7 @@ def test( x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device) y = torch.zeros(inferShape(x), dtype=tensor_dtype).to(torch_device) - for i in range(NUM_PRERUN if PROFILE else 1): - ans = globalAvgPool(x) - if PROFILE: - start_time = time.time() - for i in range(NUM_ITERATIONS): - _ = globalAvgPool(x) - elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f"pytorch time: {elapsed :6f}") + ans = globalAvgPool(x) x_tensor = to_tensor(x, lib) y_tensor = to_tensor(y, lib) @@ -96,58 +84,77 @@ def test( ) workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) - for i in range(NUM_PRERUN if PROFILE else 1): - check_error( - lib.infiniopGlobalAvgPool( - descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None - ) + check_error( + lib.infiniopGlobalAvgPool( + descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None ) + ) + + if (tensor_dtype == torch.float16): + assert torch.allclose(y, ans, atol=0, rtol=1e-3) + else: + assert torch.allclose(y, ans, atol=1e-6, rtol=1e-4) + if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + _ = globalAvgPool(x) + synchronize_device(torch_device) start_time = time.time() for i in range(NUM_ITERATIONS): + _ = globalAvgPool(x) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): check_error( lib.infiniopGlobalAvgPool( - descriptor, - workspace_ptr, - workspaceSize, - y_tensor.data, - x_tensor.data, - None, + descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None ) ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopGlobalAvgPool( + descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None + ) + ) + synchronize_device(torch_device) elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f" lib time: {elapsed :6f}") + print(f" lib time: {elapsed * 1000 :6f} ms") - assert torch.allclose(y, ans, atol=0, rtol=1e-3) check_error(lib.infiniopDestroyGlobalAvgPoolDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) for x_shape in test_cases: - test(lib, handle, "cpu", x_shape, tensor_dtype=torch.float16) - test(lib, handle, "cpu", x_shape, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cpu", x_shape, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) for x_shape in test_cases: - test(lib, handle, "cuda", x_shape, 
tensor_dtype=torch.float16) - test(lib, handle, "cuda", x_shape, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cuda", x_shape, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG handle = create_handle(lib, device) for x_shape in test_cases: - test(lib, handle, "mlu", x_shape, tensor_dtype=torch.float16) - test(lib, handle, "mlu", x_shape, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "mlu", x_shape, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) @@ -169,6 +176,9 @@ def test_bang(lib, test_cases): ((32, 256, 1, 112, 112)), ((32, 256, 112, 112)), ] + tensor_dtypes = [ + torch.float16, torch.float32, + ] args = get_args() lib = open_lib() lib.infiniopCreateGlobalAvgPoolDescriptor.restype = c_int32 @@ -197,12 +207,14 @@ def test_bang(lib, test_cases): infiniopGlobalAvgPoolDescriptor_t, ] + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if not (args.cpu or args.cuda or args.bang): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/max_pool.py b/operatorspy/tests/max_pool.py index ffc0bb19..abd1eddb 100644 --- a/operatorspy/tests/max_pool.py +++ b/operatorspy/tests/max_pool.py @@ -16,13 +16,10 @@ check_error, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device import torch from typing import Tuple -# constant for control whether profile the pytorch and lib functions -# NOTE: need to manually add synchronization function to the lib function, -# e.g., cudaDeviceSynchronize() for CUDA PROFILE = False NUM_PRERUN = 10 NUM_ITERATIONS = 1000 @@ -48,8 +45,6 @@ def pool(x, k, padding, stride, dilation = 1): return None ans = pooling_layers[ndim](k, stride=stride, padding=padding, dilation=dilation)(x) - if PROFILE: - torch.cuda.synchronize() return ans @@ -89,14 +84,7 @@ def test( x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device) y = torch.rand(inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype).to(torch_device) - for i in range(NUM_PRERUN if PROFILE else 1): - ans = pool(x, k_shape, padding, strides) - if PROFILE: - start_time = time.time() - for i in range(NUM_ITERATIONS): - _ = pool(x, k_shape, padding, strides) - elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f"pytorch time: {elapsed :6f}") + ans = pool(x, k_shape, padding, strides) x_tensor = to_tensor(x, lib) y_tensor = to_tensor(y, lib) @@ -126,18 +114,47 @@ def test( workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) - for i in range(NUM_PRERUN if PROFILE else 1): - check_error( - lib.infiniopMaxPool( - descriptor, - workspace_ptr, - workspaceSize, - y_tensor.data, - x_tensor.data, - None, - ) + check_error( + lib.infiniopMaxPool( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + None, ) + ) + + if (tensor_dtype == torch.float16): + assert torch.allclose(y, ans, atol=0, rtol=1e-3) + else: + assert torch.allclose(y, ans, atol=1e-6, rtol=1e-5) + 
if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + _ = pool(x, k_shape, padding, strides) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = pool(x, k_shape, padding, strides) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): + check_error( + lib.infiniopMaxPool( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + None, + ) + ) + synchronize_device(torch_device) start_time = time.time() for i in range(NUM_ITERATIONS): check_error( @@ -150,39 +167,39 @@ def test( None, ) ) + synchronize_device(torch_device) elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f" lib time: {elapsed :6f}") + print(f" lib time: {elapsed * 1000 :6f} ms") - assert torch.allclose(y, ans, atol=0, rtol=1e-3) check_error(lib.infiniopDestroyMaxPoolDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) for x_shape, kernel_shape, padding, strides in test_cases: - test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) - test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) for x_shape, kernel_shape, padding, strides in test_cases: - test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) - test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG handle = create_handle(lib, device) for x_shape, kernel_shape, padding, strides in test_cases: - test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) - test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=tensor_dtype) destroy_handle(lib, handle) @@ -193,6 +210,9 @@ def test_bang(lib, test_cases): ((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)), ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)), ] + tensor_dtypes = [ + torch.float16, torch.float32, + ] args = get_args() lib = open_lib() lib.infiniopCreateMaxPoolDescriptor.restype = c_int32 @@ -225,12 +245,14 @@ def test_bang(lib, test_cases): infiniopMaxPoolDescriptor_t, ] + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if not (args.cpu or args.cuda or args.bang): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) print("\033[92mTest 
passed!\033[0m") diff --git a/operatorspy/tests/mlp.py b/operatorspy/tests/mlp.py index 668d7861..b4bdfcb8 100644 --- a/operatorspy/tests/mlp.py +++ b/operatorspy/tests/mlp.py @@ -2,6 +2,7 @@ import ctypes import sys import os +import time sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) from operatorspy import ( @@ -18,10 +19,13 @@ create_workspace, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device import torch import torch.nn as nn +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 class MLPDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -137,10 +141,55 @@ def test( ) assert torch.allclose(y, ans, atol=0, rtol=2e-2) + if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + _ = mlp(y, x, w12, w3, alpha, residual) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = mlp(y, x, w12, w3, alpha, residual) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): + check_error( + lib.infiniopMLP( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + y_tensor.data, + x_tensor.data, + w12_tensor.data, + w3_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopMLP( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + y_tensor.data, + x_tensor.data, + w12_tensor.data, + w3_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed * 1000 :6f} ms") + check_error(lib.infiniopDestroyMLPDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) @@ -150,32 +199,32 @@ def test_cpu(lib, test_cases): intermediate_size, alpha, residual, - dtype, x_stride, y_stride, w12_stride, w3_stride, ) in test_cases: - test( - lib, - handle, - "cpu", - num_tokens, - hidden_size, - intermediate_size, - alpha, - residual, - dtype, - x_stride, - y_stride, - w12_stride, - w3_stride, - ) + for tensor_dtype in tensor_dtypes: + test( + lib, + handle, + "cpu", + num_tokens, + hidden_size, + intermediate_size, + alpha, + residual, + tensor_dtype, + x_stride, + y_stride, + w12_stride, + w3_stride, + ) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) @@ -185,32 +234,32 @@ def test_cuda(lib, test_cases): intermediate_size, alpha, residual, - dtype, x_stride, y_stride, w12_stride, w3_stride, ) in test_cases: - test( - lib, - handle, - "cuda", - num_tokens, - hidden_size, - intermediate_size, - alpha, - residual, - dtype, - x_stride, - y_stride, - w12_stride, - w3_stride, - ) + for tensor_dtype in tensor_dtypes: + test( + lib, + handle, + "cuda", + num_tokens, + hidden_size, + intermediate_size, + alpha, + residual, + tensor_dtype, + x_stride, + y_stride, + w12_stride, + w3_stride, + ) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG @@ -222,27 +271,27 @@ def 
test_bang(lib, test_cases): intermediate_size, alpha, residual, - dtype, x_stride, y_stride, w12_stride, w3_stride, ) in test_cases: - test( - lib, - handle, - "mlu", - num_tokens, - hidden_size, - intermediate_size, - alpha, - residual, - dtype, - x_stride, - y_stride, - w12_stride, - w3_stride, - ) + for tensor_dtype in tensor_dtypes: + test( + lib, + handle, + "mlu", + num_tokens, + hidden_size, + intermediate_size, + alpha, + residual, + tensor_dtype, + x_stride, + y_stride, + w12_stride, + w3_stride, + ) destroy_handle(lib, handle) @@ -250,22 +299,24 @@ def test_bang(lib, test_cases): if __name__ == "__main__": test_cases = [ # num_tokens, hidden_size, intermediate_size, alpha, residual, dtype, x_stride, y_stride, w12_stride, w3_stride - (4, 4096, 11008, 1.0, True, torch.float16, None, None, None, None), - (4, 4096, 11008, 1.0, True, torch.float16, [8192, 1], [8192, 1], None, None), + (4, 4096, 11008, 1.0, True, None, None, None, None), + (4, 4096, 11008, 1.0, True, [8192, 1], [8192, 1], None, None), ( 4, 4096, 11008, 1.0, True, - torch.float16, None, None, [1, 4096], [1, 11008], ), - (4, 4096, 11008, 1.0, False, torch.float16, None, None, None, None), - (4, 4096, 11008, 1.0, False, torch.float16, [8192, 1], [8192, 1], None, None), + (4, 4096, 11008, 1.0, False, None, None, None, None), + (4, 4096, 11008, 1.0, False, [8192, 1], [8192, 1], None, None), + ] + tensor_dtypes = [ + torch.float16, ] args = get_args() lib = open_lib() @@ -305,12 +356,14 @@ def test_bang(lib, test_cases): infiniopMLPDescriptor_t, ] + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if not (args.cpu or args.cuda or args.bang): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/random_sample.py b/operatorspy/tests/random_sample.py index 98a8dceb..8693d807 100644 --- a/operatorspy/tests/random_sample.py +++ b/operatorspy/tests/random_sample.py @@ -2,6 +2,7 @@ import ctypes import sys import os +import time sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) from operatorspy import ( @@ -18,9 +19,12 @@ U64, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device import torch +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 class RandomSampleDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -129,40 +133,100 @@ def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_ torch.npu.synchronize() assert indices[0].type(ans.dtype) == ans or data[ans] == data[indices[0]] + + if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + if(topp > 0 and topk > 1): + _ = random_sample(data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu") + else: + _ = random_sample_0(data) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + if(topp > 0 and topk > 1): + _ = random_sample(data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu") + else: + _ = random_sample_0(data) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): + 
check_error( + lib.infiniopRandomSample( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + indices_tensor.data, + x_tensor.data, + random_val, + topp, + topk, + temperature, + None, + ) + ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopRandomSample( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + indices_tensor.data, + x_tensor.data, + random_val, + topp, + topk, + temperature, + None, + ) + ) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed * 1000 :6f} ms") + check_error(lib.infiniopDestroyRandomSampleDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) for (voc, random_val, topp, topk, temperature) in test_cases: - test(lib, handle, "cpu", voc, random_val, topp, topk, temperature) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cpu", voc, random_val, topp, topk, temperature, x_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) for (voc, random_val, topp, topk, temperature) in test_cases: - test(lib, handle, "cuda", voc, random_val, topp, topk, temperature) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cuda", voc, random_val, topp, topk, temperature, x_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG handle = create_handle(lib, device) for (voc, random_val, topp, topk, temperature) in test_cases: - test(lib, handle, "mlu", voc, random_val, topp, topk, temperature) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "mlu", voc, random_val, topp, topk, temperature, x_dtype=tensor_dtype) destroy_handle(lib, handle) -def test_ascend(lib, test_cases): +def test_ascend(lib, test_cases, tensor_dtypes): import torch_npu device = DeviceEnum.DEVICE_ASCEND handle = create_handle(lib, device) for (voc, random_val, topp, topk, temperature) in test_cases: - test(lib, handle, "npu", voc, random_val, topp, topk, temperature) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "npu", voc, random_val, topp, topk, temperature, x_dtype=tensor_dtype) destroy_handle(lib, handle) @@ -180,6 +244,9 @@ def test_ascend(lib, test_cases): (32000, 0.08, 1.0, 25, 1.0), # (119696, 0.01, 1.0, 100, 1.0), ] + tensor_dtypes = [ + torch.float16, + ] args = get_args() lib = open_lib() @@ -212,14 +279,16 @@ def test_ascend(lib, test_cases): infiniopRandomSampleDescriptor_t, ] + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if args.ascend: - test_ascend(lib, test_cases) + test_ascend(lib, test_cases, tensor_dtypes) if not (args.cpu or args.cuda or args.bang or args.ascend): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/rearrange.py b/operatorspy/tests/rearrange.py index e9cc81b9..8902633c 100644 --- a/operatorspy/tests/rearrange.py 
+++ b/operatorspy/tests/rearrange.py
@@ -2,6 +2,7 @@
 from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
 import sys
 import os
+import time
 
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
 from operatorspy import (
@@ -17,9 +18,12 @@
     rearrange_tensor,
 )
 
-from operatorspy.tests.test_utils import get_args
+from operatorspy.tests.test_utils import get_args, synchronize_device
 import torch
 
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
 
 class RerrangeDescriptor(Structure):
     _fields_ = [("device", c_int32)]
@@ -28,6 +32,12 @@ class RerrangeDescriptor(Structure):
 infiniopRearrangeDescriptor_t = POINTER(RerrangeDescriptor)
 
 
+def rearrange(x, x_shape, y_strides):
+    y_ = x.clone()
+    y_.set_(y_.untyped_storage(), 0, x_shape, y_strides)
+    y_[:] = x.view_as(y_)
+    return y_
+
 def test(
     lib,
     handle,
@@ -47,6 +57,9 @@ def test(
         x = rearrange_tensor(x, x_stride)
     if y_stride is not None:
         y = rearrange_tensor(y, y_stride)
+
+    ans = rearrange(x, x_shape, y_stride)
+
     x_tensor = to_tensor(x, lib)
     y_tensor = to_tensor(y, lib)
 
@@ -64,40 +77,72 @@ def test(
     check_error(
         lib.infiniopRearrange(descriptor, y_tensor.data, x_tensor.data, None)
     )
-    assert torch.allclose(x, y, atol=0, rtol=1e-3)
+
+    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
+
+    if PROFILE:
+        # Profiling PyTorch implementation
+        for i in range(NUM_PRERUN):
+            _ = rearrange(x, x_shape, y_stride)
+        synchronize_device(torch_device)
+        start_time = time.time()
+        for i in range(NUM_ITERATIONS):
+            _ = rearrange(x, x_shape, y_stride)
+        synchronize_device(torch_device)
+        elapsed = (time.time() - start_time) / NUM_ITERATIONS
+        print(f" pytorch time: {elapsed * 1000 :6f} ms")
+
+        # Profiling C Operators implementation
+        for i in range(NUM_PRERUN):
+            check_error(
+                lib.infiniopRearrange(descriptor, y_tensor.data, x_tensor.data, None)
+            )
+        synchronize_device(torch_device)
+        start_time = time.time()
+        for i in range(NUM_ITERATIONS):
+            check_error(
+                lib.infiniopRearrange(descriptor, y_tensor.data, x_tensor.data, None)
+            )
+        synchronize_device(torch_device)
+        elapsed = (time.time() - start_time) / NUM_ITERATIONS
+        print(f" lib time: {elapsed * 1000 :6f} ms")
+
     check_error(lib.infiniopDestroyRearrangeDescriptor(descriptor))
 
 
-def test_cpu(lib, test_cases):
+def test_cpu(lib, test_cases, tensor_dtypes):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
     for test_case in test_cases:
         x_shape, x_stride = test_case[0]
         y_shape, y_stride = test_case[1]
-        test(lib, handle, "cpu", x_shape, x_stride, y_shape, y_stride)
+        for tensor_dtype in tensor_dtypes:
+            test(lib, handle, "cpu", x_shape, x_stride, y_shape, y_stride, tensor_dtype)
     destroy_handle(lib, handle)
 
 
-def test_cuda(lib, test_cases):
+def test_cuda(lib, test_cases, tensor_dtypes):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
     for test_case in test_cases:
         x_shape, x_stride = test_case[0]
         y_shape, y_stride = test_case[1]
-        test(lib, handle, "cuda", x_shape, x_stride, y_shape, y_stride)
+        for tensor_dtype in tensor_dtypes:
+            test(lib, handle, "cuda", x_shape, x_stride, y_shape, y_stride, tensor_dtype)
     destroy_handle(lib, handle)
 
 
-def test_bang(lib, test_cases):
+def test_bang(lib, test_cases, tensor_dtypes):
     import torch_mlu
 
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
     for test_case in test_cases:
         x_shape, x_stride = test_case[0]
         y_shape, y_stride = test_case[1]
-        test(lib, handle, "mlu", x_shape, x_stride, y_shape, y_stride)
+        for tensor_dtype in tensor_dtypes:
+            test(lib, handle, "mlu", x_shape,
x_stride, y_shape, y_stride, tensor_dtype) destroy_handle(lib, handle) -def test_ascend(lib, test_cases): +def test_ascend(lib, test_cases, tensor_dtypes): import torch_npu device = DeviceEnum.DEVICE_ASCEND @@ -105,7 +150,8 @@ def test_ascend(lib, test_cases): for test_case in test_cases: x_shape, x_stride = test_case[0] y_shape, y_stride = test_case[1] - test(lib, handle, "npu", x_shape, x_stride, y_shape, y_stride) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "npu", x_shape, x_stride, y_shape, y_stride, tensor_dtype) destroy_handle(lib, handle) if __name__ == "__main__": @@ -120,6 +166,9 @@ def test_ascend(lib, test_cases): (((4, 1, 64), (64, 2560, 1)), ((4, 1, 64), (64, 11264, 1))), (((64,), (1,)), ((64,), (1,))), ] + tensor_dtypes = [ + torch.float16, + ] lib = open_lib() lib.infiniopCreateRearrangeDescriptor.restype = c_int32 lib.infiniopCreateRearrangeDescriptor.argtypes = [ @@ -137,12 +186,15 @@ def test_ascend(lib, test_cases): ] lib.infiniopDestroyRearrangeDescriptor.restype = c_int32 lib.infiniopDestroyRearrangeDescriptor.argtypes = [infiniopRearrangeDescriptor_t] + + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if args.ascend: - test_ascend(lib, test_cases) + test_ascend(lib, test_cases, tensor_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/relu.py b/operatorspy/tests/relu.py index b7f76627..5940468f 100644 --- a/operatorspy/tests/relu.py +++ b/operatorspy/tests/relu.py @@ -16,13 +16,10 @@ check_error, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device from enum import Enum, auto import torch -# constant for control whether profile the pytorch and lib functions -# NOTE: need to manually add synchronization function to the lib function, -# e.g., cudaDeviceSynchronize() for CUDA PROFILE = False NUM_PRERUN = 10 NUM_ITERATIONS = 1000 @@ -41,10 +38,6 @@ class ReluDescriptor(Structure): def relu(x): - if PROFILE: - ans = torch.nn.functional.relu(x).to(x.dtype) - torch.cuda.synchronize() - return ans return torch.nn.functional.relu(x).to(x.dtype) @@ -63,14 +56,7 @@ def test( x = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1 y = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else x - for i in range(NUM_PRERUN if PROFILE else 1): - ans = relu(x) - if PROFILE: - start_time = time.time() - for i in range(NUM_ITERATIONS): - _ = relu(x) - elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f"pytorch time: {elapsed :6f}") + ans = relu(x) x_tensor = to_tensor(x, lib) y_tensor = to_tensor(y, lib) if inplace == Inplace.OUT_OF_PLACE else x_tensor @@ -89,47 +75,62 @@ def test( x_tensor.descriptor.contents.invalidate() y_tensor.descriptor.contents.invalidate() - for i in range(NUM_PRERUN if PROFILE else 1): - check_error(lib.infiniopRelu(descriptor, y_tensor.data, x_tensor.data, None)) + check_error(lib.infiniopRelu(descriptor, y_tensor.data, x_tensor.data, None)) + + assert torch.allclose(y, ans, atol=0, rtol=0) + if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + _ = relu(x) + synchronize_device(torch_device) start_time = time.time() for i in range(NUM_ITERATIONS): - check_error( - lib.infiniopRelu(descriptor, y_tensor.data, 
x_tensor.data, None) - ) + _ = relu(x) + synchronize_device(torch_device) elapsed = (time.time() - start_time) / NUM_ITERATIONS - print(f" lib time: {elapsed :6f}") + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): + check_error(lib.infiniopRelu(descriptor, y_tensor.data, x_tensor.data, None)) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error(lib.infiniopRelu(descriptor, y_tensor.data, x_tensor.data, None)) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed * 1000 :6f} ms") - assert torch.allclose(y, ans, atol=0, rtol=1e-3) check_error(lib.infiniopDestroyReluDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) for tensor_shape, inplace in test_cases: - test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float16, inplace=inplace) - test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float32, inplace=inplace) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cpu", tensor_shape, tensor_dtype=tensor_dtype, inplace=inplace) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) for tensor_shape, inplace in test_cases: - test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float16, inplace=inplace) - test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float32, inplace=inplace) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cuda", tensor_shape, tensor_dtype=tensor_dtype, inplace=inplace) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG handle = create_handle(lib, device) for tensor_shape, inplace in test_cases: - test(lib, handle, "mlu", tensor_shape, tensor_dtype=torch.float16, inplace=inplace) - test(lib, handle, "mlu", tensor_shape, tensor_dtype=torch.float32, inplace=inplace) + for tensor_dtype in tensor_dtypes: + test(lib, handle, "mlu", tensor_shape, tensor_dtype=tensor_dtype, inplace=inplace) destroy_handle(lib, handle) @@ -145,6 +146,9 @@ def test_bang(lib, test_cases): ((33, 333, 333), Inplace.OUT_OF_PLACE), ((32, 256, 112, 112), Inplace.OUT_OF_PLACE), ] + tensor_dtypes = [ + torch.float16, torch.float32, + ] args = get_args() lib = open_lib() lib.infiniopCreateReluDescriptor.restype = c_int32 @@ -166,12 +170,14 @@ def test_bang(lib, test_cases): infiniopReluDescriptor_t, ] + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if not (args.cpu or args.cuda or args.bang): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/rms_norm.py b/operatorspy/tests/rms_norm.py index 13cf1ccf..3845e7be 100644 --- a/operatorspy/tests/rms_norm.py +++ b/operatorspy/tests/rms_norm.py @@ -2,6 +2,7 @@ import ctypes import sys import os +import time sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) from operatorspy import ( @@ -17,9 +18,13 @@ create_workspace, ) 
-from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device import torch +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + class RMSNormDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -84,44 +89,102 @@ def test(lib, handle, torch_device, y_shape, x_shape, w_shape, dtype=torch.float ) assert torch.allclose(y.to(dtype), ans.to(dtype), atol=1e-3, rtol=1e-3) + + if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + _ = rms_norm(x, w, eps) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = rms_norm(x, w, eps) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): + check_error( + lib.infiniopRMSNorm( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + y_tensor.data, + x_tensor.data, + w_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopRMSNorm( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + y_tensor.data, + x_tensor.data, + w_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed * 1000 :6f} ms") + check_error(lib.infiniopDestroyRMSNormDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes, w_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) - for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases: - test(lib, handle, "cpu", y_shape, x_shape, w_shape, dtype, w_dtype) + for (y_shape, x_shape, w_shape) in test_cases: + for tensor_dtype in tensor_dtypes: + for w_dtype in w_dtypes: + test(lib, handle, "cpu", y_shape, x_shape, w_shape, tensor_dtype, w_dtype) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes, w_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) - for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases: - test(lib, handle, "cuda", y_shape, x_shape, w_shape, dtype, w_dtype) + for (y_shape, x_shape, w_shape) in test_cases: + for tensor_dtype in tensor_dtypes: + for w_dtype in w_dtypes: + test(lib, handle, "cuda", y_shape, x_shape, w_shape, tensor_dtype, w_dtype) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes, w_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG handle = create_handle(lib, device) - for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases: - test(lib, handle, "mlu", y_shape, x_shape, w_shape, dtype, w_dtype) + for (y_shape, x_shape, w_shape) in test_cases: + for tensor_dtype in tensor_dtypes: + for w_dtype in w_dtypes: + test(lib, handle, "mlu", y_shape, x_shape, w_shape, tensor_dtype, w_dtype) destroy_handle(lib, handle) -def test_ascend(lib, test_cases): +def test_ascend(lib, test_cases, tensor_dtypes, w_dtypes): import torch_npu device = DeviceEnum.DEVICE_ASCEND handle = create_handle(lib, device) - for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases: - test(lib, handle, "npu", y_shape, x_shape, w_shape, dtype, w_dtype) + for (y_shape, x_shape, w_shape) in test_cases: + for tensor_dtype in tensor_dtypes: + for 
w_dtype in w_dtypes: + test(lib, handle, "npu", y_shape, x_shape, w_shape, tensor_dtype, w_dtype) destroy_handle(lib, handle) if __name__ == "__main__": test_cases = [ - # y_shape, x_shape, w_shape, dtype, w_dtype - ((16, 2048), (16, 2048), (2048,), torch.float16, torch.float16), - ((16, 2048), (16, 2048), (2048,), torch.float16, torch.float32), + # y_shape, x_shape, w_shape, + ((16, 2048), (16, 2048), (2048,)), + ((32, 4096), (32, 4096), (4096,)), + ] + tensor_dtypes = [ + torch.float16, + ] + w_dtypes = [ + torch.float16, torch.float32, ] args = get_args() lib = open_lib() @@ -156,14 +219,16 @@ def test_ascend(lib, test_cases): infiniopRMSNormDescriptor_t, ] + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes, w_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes, w_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes, w_dtypes) if args.ascend: - test_ascend(lib, test_cases) + test_ascend(lib, test_cases, tensor_dtypes, w_dtypes) if not (args.cpu or args.cuda or args.bang or args.ascend): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes, w_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/rotary_embedding.py b/operatorspy/tests/rotary_embedding.py index 081d2f91..1442ad04 100644 --- a/operatorspy/tests/rotary_embedding.py +++ b/operatorspy/tests/rotary_embedding.py @@ -2,7 +2,7 @@ from ctypes import c_float, POINTER, c_void_p, c_int32, c_uint64, Structure, byref import sys import os - +import time sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) from operatorspy import ( @@ -19,9 +19,12 @@ U64, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device import torch +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 class RoPEDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -135,53 +138,116 @@ def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16): ) assert torch.allclose(t, ans, atol=1e-4, rtol=1e-2) + + if PROFILE: + # Profiling PyTorch implementation + posTmp_d = posTmp.to(torch_device) + + for i in range(NUM_PRERUN): + if torch_device == 'mlu' or torch_device == 'npu': + _ = rotary_embedding(t, posTmp, theta, "cpu").to(torch_device) + else: + _ = rotary_embedding(t, posTmp_d, theta, torch_device) + synchronize_device(torch_device) + + start_time = time.time() + for i in range(NUM_ITERATIONS): + if torch_device == 'mlu' or torch_device == 'npu': + _ = rotary_embedding(t, posTmp, theta, "cpu").to(torch_device) + else: + _ = rotary_embedding(t, posTmp_d, theta, torch_device) + synchronize_device(torch_device) + + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): + check_error( + lib.infiniopRoPE( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + t_tensor.data, + pos_tensor.data, + sin_table_tensor.data, + cos_table_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopRoPE( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + t_tensor.data, + pos_tensor.data, + sin_table_tensor.data, + cos_table_tensor.data, + None, + ) + ) + 
synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed * 1000 :6f} ms") + check_error(lib.infiniopDestroyRoPEDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CPU handle = create_handle(lib, device) - for shape, strides, dtype in test_cases: - test(lib, handle, "cpu", shape, strides, dtype) + for shape, strides in test_cases: + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cpu", shape, strides, tensor_dtype) destroy_handle(lib, handle) -def test_cuda(lib, test_cases): +def test_cuda(lib, test_cases, tensor_dtypes): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) - for shape, strides, dtype in test_cases: - test(lib, handle, "cuda", shape, strides, dtype) + for shape, strides in test_cases: + for tensor_dtype in tensor_dtypes: + test(lib, handle, "cuda", shape, strides, tensor_dtype) destroy_handle(lib, handle) -def test_bang(lib, test_cases): +def test_bang(lib, test_cases, tensor_dtypes): import torch_mlu device = DeviceEnum.DEVICE_BANG handle = create_handle(lib, device) - for shape, strides, dtype in test_cases: - test(lib, handle, "mlu", shape, strides, dtype) + for shape, strides in test_cases: + for tensor_dtype in tensor_dtypes: + test(lib, handle, "mlu", shape, strides, tensor_dtype) destroy_handle(lib, handle) -def test_ascend(lib, test_cases) : +def test_ascend(lib, test_cases, tensor_dtypes) : import torch_npu device = DeviceEnum.DEVICE_ASCEND handle = create_handle(lib, device) - for shape, strides, dtype in test_cases: - test(lib, handle, "npu", shape, strides, dtype) + for shape, strides in test_cases: + for tensor_dtype in tensor_dtypes: + test(lib, handle, "npu", shape, strides, tensor_dtype) destroy_handle(lib, handle) if __name__ == "__main__": test_cases = [ - ((1, 32, 128), None, torch.float16), - ((1, 32, 64), None, torch.float16), + ((1, 32, 128), None), + ((1, 32, 64), None), # 昇腾暂不满足这个用例,最后一维度 <=32 会有问题,可能与其核心 # 接口 GatherMask 的内部实现相关,目前 48 64 128 都可以支持 - ((4, 1, 32), None, torch.float16), - ((1, 32, 128), None, torch.float16), + ((4, 1, 32), None), + ((1, 32, 128), None), - ((3, 32, 128), (8000, 200, 1), torch.float16), + ((3, 32, 128), (8000, 200, 1)), + ] + tensor_dtypes = [ + torch.float16, ] args = get_args() lib = open_lib() @@ -214,14 +280,17 @@ def test_ascend(lib, test_cases) : lib.infiniopDestroyRoPEDescriptor.argtypes = [ infiniopRoPEDescriptor_t, ] + + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) if args.cuda: - test_cuda(lib, test_cases) + test_cuda(lib, test_cases, tensor_dtypes) if args.bang: - test_bang(lib, test_cases) + test_bang(lib, test_cases, tensor_dtypes) if args.ascend: - test_ascend(lib, test_cases) + test_ascend(lib, test_cases, tensor_dtypes) if not (args.cpu or args.cuda or args.bang or args.ascend): - test_cpu(lib, test_cases) + test_cpu(lib, test_cases, tensor_dtypes) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/swiglu.py b/operatorspy/tests/swiglu.py index 7fb447a1..a8772630 100644 --- a/operatorspy/tests/swiglu.py +++ b/operatorspy/tests/swiglu.py @@ -2,6 +2,7 @@ import ctypes import sys import os +import time sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) from operatorspy import ( @@ -17,9 +18,20 @@ rearrange_tensor, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, 
synchronize_device +from enum import Enum, auto import torch +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + class SwiGLUDescriptor(Structure): _fields_ = [("device", c_int32)] @@ -28,138 +40,72 @@ class SwiGLUDescriptor(Structure): infiniopSwiGLUDescriptor_t = POINTER(SwiGLUDescriptor) -def swiglu(a, b): - - return a * b / (1 + torch.exp(-b.float()).to(b.dtype)) +def print_discrepancy(actual, expected, atol=0, rtol=1e-2): + if actual.shape != expected.shape: + raise ValueError("Tensors must have the same shape to compare.") -def test_out_of_place( - lib, - handle, - torch_device, - shape, - a_stride=None, - b_stride=None, - c_stride=None, - dtype=torch.float16, - sync=None, -): - print( - f"Testing SwiGLU on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} dtype:{dtype}" - ) - a = torch.rand(shape, dtype=dtype).to(torch_device) - b = torch.rand(shape, dtype=dtype).to(torch_device) - c = torch.rand(shape, dtype=dtype).to(torch_device) - - if a_stride is not None: - a = rearrange_tensor(a, a_stride) - if b_stride is not None: - b = rearrange_tensor(b, b_stride) - if c_stride is not None: - c = rearrange_tensor(c, c_stride) - ans = swiglu(a, b) + # Calculate the difference mask based on atol and rtol + diff_mask = torch.abs(actual - expected) > (atol + rtol * torch.abs(expected)) + diff_indices = torch.nonzero(diff_mask, as_tuple=False) - if sync is not None: - sync() + # Widths for columns (adjusted based on data) + col_width = [18, 18, 18, 18] + decimal_places = [0, 12, 12, 12] - a_tensor = to_tensor(a, lib) - b_tensor = to_tensor(b, lib) - c_tensor = to_tensor(c, lib) - descriptor = infiniopSwiGLUDescriptor_t() - check_error( - lib.infiniopCreateSwiGLUDescriptor( - handle, - ctypes.byref(descriptor), - c_tensor.descriptor, - a_tensor.descriptor, - b_tensor.descriptor, + for idx in diff_indices: + index_tuple = tuple(idx.tolist()) + print( + f" > Index: {str(index_tuple):<{col_width[0]}}" + f"actual: \033[31m{actual[index_tuple]:<{col_width[1]}.{decimal_places[1]}f}\033[0m" + f"expect: \033[32m{expected[index_tuple]:<{col_width[2]}.{decimal_places[2]}f}\033[0m" + f"delta: \033[33m{actual[index_tuple] - expected[index_tuple]:<{col_width[3]}.{decimal_places[3]}f}\033[0m" ) - ) - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - a_tensor.descriptor.contents.invalidate() - b_tensor.descriptor.contents.invalidate() - c_tensor.descriptor.contents.invalidate() + return diff_indices - check_error( - lib.infiniopSwiGLU( - descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None - ) - ) - assert torch.allclose(c, ans, atol=1e-4, rtol=1e-2) - print("out-of-place Test passed!") +def swiglu(a, b): - check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor)) + return a * b / (1 + torch.exp(-b.float()).to(b.dtype)) -def test_in_place1( +def test( lib, handle, torch_device, shape, a_stride=None, b_stride=None, + c_stride=None, dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, sync=None, ): + print( + f"Testing SwiGLU on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} dtype:{dtype} " + f"inplace:{inplace}" + ) a = torch.rand(shape, dtype=dtype).to(torch_device) b = torch.rand(shape, dtype=dtype).to(torch_device) + c = ( + torch.rand(shape, dtype=dtype).to(torch_device) + if inplace == Inplace.OUT_OF_PLACE + else (a if inplace == Inplace.INPLACE_A 
else b) + ) if a_stride is not None: a = rearrange_tensor(a, a_stride) if b_stride is not None: b = rearrange_tensor(b, b_stride) - ans = swiglu(a, b) - - if sync is not None: - sync() - - a_tensor = to_tensor(a, lib) - b_tensor = to_tensor(b, lib) - descriptor = infiniopSwiGLUDescriptor_t() - check_error( - lib.infiniopCreateSwiGLUDescriptor( - handle, - ctypes.byref(descriptor), - a_tensor.descriptor, - a_tensor.descriptor, - b_tensor.descriptor, - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - a_tensor.descriptor.contents.invalidate() - b_tensor.descriptor.contents.invalidate() + if c_stride is not None and inplace == Inplace.OUT_OF_PLACE: + c = rearrange_tensor(c, c_stride) - check_error( - lib.infiniopSwiGLU( - descriptor, a_tensor.data, a_tensor.data, b_tensor.data, None - ) + c = ( + c + if inplace == Inplace.OUT_OF_PLACE + else (a if inplace == Inplace.INPLACE_A else b) ) - assert torch.allclose(a, ans, atol=1e-4, rtol=1e-2) - print("in-place1 Test passed!") - - check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor)) - - -def test_in_place2( - lib, - handle, - torch_device, - shape, - a_stride=None, - b_stride=None, - dtype=torch.float16, - sync=None, -): - a = torch.rand(shape, dtype=dtype).to(torch_device) - b = torch.rand(shape, dtype=dtype).to(torch_device) - - if a_stride is not None: - a = rearrange_tensor(a, a_stride) - if b_stride is not None: - b = rearrange_tensor(b, b_stride) ans = swiglu(a, b) if sync is not None: @@ -167,12 +113,17 @@ def test_in_place2( a_tensor = to_tensor(a, lib) b_tensor = to_tensor(b, lib) + c_tensor = ( + to_tensor(c, lib) + if inplace == Inplace.OUT_OF_PLACE + else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) + ) descriptor = infiniopSwiGLUDescriptor_t() check_error( lib.infiniopCreateSwiGLUDescriptor( handle, ctypes.byref(descriptor), - b_tensor.descriptor, + c_tensor.descriptor, a_tensor.descriptor, b_tensor.descriptor, ) @@ -181,83 +132,153 @@ def test_in_place2( # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel a_tensor.descriptor.contents.invalidate() b_tensor.descriptor.contents.invalidate() + c_tensor.descriptor.contents.invalidate() check_error( lib.infiniopSwiGLU( - descriptor, b_tensor.data, a_tensor.data, b_tensor.data, None + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None ) ) - assert torch.allclose(b, ans, atol=1e-4, rtol=1e-2) + assert torch.allclose(c, ans, atol=1e-4, rtol=1e-2) + + if PROFILE: + # Profiling PyTorch implementation + for i in range(NUM_PRERUN): + _ = swiglu(a, b) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = swiglu(a, b) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + + # Profiling C Operators implementation + for i in range(NUM_PRERUN): + check_error( + lib.infiniopSwiGLU( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopSwiGLU( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + ) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed * 1000 :6f} ms") check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor)) -def test_cpu(lib, test_cases): +def test_cpu(lib, test_cases, 
tensor_dtypes):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
-    for shape, a_stride, b_stride, c_stride, dtype in test_cases:
-        test_out_of_place(
-            lib, handle, "cpu", shape, a_stride, b_stride, c_stride, dtype
-        )
-        test_in_place1(lib, handle, "cpu", shape, a_stride, b_stride, dtype)
-        test_in_place2(lib, handle, "cpu", shape, a_stride, b_stride, dtype)
+    for shape, a_stride, b_stride, c_stride, inplace in test_cases:
+        for tensor_dtype in tensor_dtypes:
+            test(
+                lib,
+                handle,
+                "cpu",
+                shape,
+                a_stride,
+                b_stride,
+                c_stride,
+                tensor_dtype,
+                inplace,
+            )
     destroy_handle(lib, handle)
 
 
-def test_cuda(lib, test_cases):
+def test_cuda(lib, test_cases, tensor_dtypes):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
-    for shape, a_stride, b_stride, c_stride, dtype in test_cases:
-        test_out_of_place(
-            lib, handle, "cuda", shape, a_stride, b_stride, c_stride, dtype
-        )
-        test_in_place1(lib, handle, "cuda", shape, a_stride, b_stride, dtype)
-        test_in_place2(lib, handle, "cuda", shape, a_stride, b_stride, dtype)
+    for shape, a_stride, b_stride, c_stride, inplace in test_cases:
+        for tensor_dtype in tensor_dtypes:
+            test(
+                lib,
+                handle,
+                "cuda",
+                shape,
+                a_stride,
+                b_stride,
+                c_stride,
+                tensor_dtype,
+                inplace,
+            )
     destroy_handle(lib, handle)
 
 
-def test_bang(lib, test_cases):
+def test_bang(lib, test_cases, tensor_dtypes):
     import torch_mlu
+
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
-    for shape, a_stride, b_stride, c_stride, dtype in test_cases:
-        test_out_of_place(
-            lib, handle, "mlu", shape, a_stride, b_stride, c_stride, dtype
-        )
-        test_in_place1(lib, handle, "mlu", shape, a_stride, b_stride, dtype)
-        test_in_place2(lib, handle, "mlu", shape, a_stride, b_stride, dtype)
+    for shape, a_stride, b_stride, c_stride, inplace in test_cases:
+        for tensor_dtype in tensor_dtypes:
+            test(
+                lib,
+                handle,
+                "mlu",
+                shape,
+                a_stride,
+                b_stride,
+                c_stride,
+                tensor_dtype,
+                inplace,
+            )
     destroy_handle(lib, handle)
 
 
-def test_ascend(lib, test_cases):
+def test_ascend(lib, test_cases, tensor_dtypes):
     import torch_npu
+
     device = DeviceEnum.DEVICE_ASCEND
     handle = create_handle(lib, device)
-    for shape, a_stride, b_stride, c_stride, dtype in test_cases:
-        test_out_of_place(
-            lib, handle, "npu", shape, a_stride, b_stride, c_stride, dtype, torch.npu.synchronize
-        )
-        test_in_place1(lib, handle, "npu", shape, a_stride, b_stride, dtype, torch.npu.synchronize)
-        test_in_place2(lib, handle, "npu", shape, a_stride, b_stride, dtype, torch.npu.synchronize)
+    for shape, a_stride, b_stride, c_stride, inplace in test_cases:
+        for tensor_dtype in tensor_dtypes:
+            test(
+                lib,
+                handle,
+                "npu",
+                shape,
+                a_stride,
+                b_stride,
+                c_stride,
+                tensor_dtype,
+                inplace,
+                torch.npu.synchronize,
+            )
-    destroy_handle(lib, handle)
+    destroy_handle(lib, handle)
 
 
 if __name__ == "__main__":
     test_cases = [
-        # shape, a_stride, b_stride, c_stride, dtype
-        ((13, 4), None, None, None, torch.float16),
-        ((13, 4), (10, 1), (10, 1), (10, 1), torch.float16),
-        ((16, 5632), None, None, None, torch.float16),
-        ((16, 5632), (13312, 1), (13312, 1), (13312, 1), torch.float16),
+        # shape, a_stride, b_stride, c_stride, inplace
+        ((13, 4), None, None, None, Inplace.OUT_OF_PLACE),
+        ((13, 4), None, None, None, Inplace.INPLACE_A),
+        ((13, 4), None, None, None, Inplace.INPLACE_B),
+        ((13, 4), (10, 1), (10, 1), (10, 1), Inplace.OUT_OF_PLACE),
+        ((16, 5632), None, None, None, Inplace.OUT_OF_PLACE),
+        ((16, 5632), (13312, 1), (13312, 1), (13312, 1), Inplace.OUT_OF_PLACE),
+        ((16, 5632), (13312, 1), (13312, 1), (13312, 1), Inplace.INPLACE_A),
+        ((16, 5632), (13312, 1), (13312, 1), (13312, 1), Inplace.INPLACE_B),
+    ]
+    tensor_dtypes = [
+        torch.float16,
     ]
     args = get_args()
     lib = open_lib()
@@ -285,12 +306,16 @@ def test_ascend(lib, test_cases):
         infiniopSwiGLUDescriptor_t,
     ]
 
+    if args.profile:
+        PROFILE = True
     if args.cpu:
-        test_cpu(lib, test_cases)
+        test_cpu(lib, test_cases, tensor_dtypes)
     if args.cuda:
-        test_cuda(lib, test_cases)
+        test_cuda(lib, test_cases, tensor_dtypes)
     if args.bang:
-        test_bang(lib, test_cases)
+        test_bang(lib, test_cases, tensor_dtypes)
     if args.ascend:
-        test_ascend(lib, test_cases)
+        test_ascend(lib, test_cases, tensor_dtypes)
+    if not (args.cpu or args.cuda or args.bang or args.ascend):
+        test_cpu(lib, test_cases, tensor_dtypes)
     print("\033[92mTest passed!\033[0m")
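
Note for reviewers: the refactored tests import `synchronize_device` from `operatorspy/tests/test_utils.py` and read an `args.profile` flag, but that file is not part of this patch. The following is a minimal sketch of what those helpers are assumed to look like; the flag name, the argparse layout, and the per-backend synchronize calls are inferred from how the tests use them, not taken from the repository.

```python
# Hypothetical sketch only -- operatorspy/tests/test_utils.py is not touched by
# this patch, so everything below is an assumption inferred from the call sites.
import argparse

import torch


def synchronize_device(torch_device):
    # Block until all queued kernels on the given device have finished, so the
    # wall-clock timings around the NUM_ITERATIONS loops measure device work
    # rather than just kernel launch overhead.
    if torch_device == "cuda":
        torch.cuda.synchronize()
    elif torch_device == "mlu":
        torch.mlu.synchronize()  # assumed to be available once torch_mlu is imported
    elif torch_device == "npu":
        torch.npu.synchronize()  # assumed to be available once torch_npu is imported
    # "cpu" executes synchronously; nothing to do.


def get_args():
    parser = argparse.ArgumentParser(description="Run operator tests")
    parser.add_argument("--cpu", action="store_true")
    parser.add_argument("--cuda", action="store_true")
    parser.add_argument("--bang", action="store_true")
    parser.add_argument("--ascend", action="store_true")
    # New flag consumed by the refactored tests: `if args.profile: PROFILE = True`
    parser.add_argument("--profile", action="store_true")
    return parser.parse_args()
```

With helpers along these lines, a command such as `python operatorspy/tests/swiglu.py --cuda --profile` would run the correctness checks and then print the averaged PyTorch and library timings in milliseconds.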