From 75b78d967823959ca961e8f4fb36aa41c428003d Mon Sep 17 00:00:00 2001 From: Zimin Li Date: Thu, 21 Nov 2024 18:08:27 +0800 Subject: [PATCH 1/5] Refactor add into a binary elementwise operation and add sub, mul, div, pow, mod, min, max --- include/ops/div/div.h | 27 +++ include/ops/max/max.h | 27 +++ include/ops/min/min.h | 27 +++ include/ops/mod/mod.h | 27 +++ include/ops/mul/mul.h | 27 +++ include/ops/pow/pow.h | 27 +++ include/ops/sub/sub.h | 27 +++ operatorspy/tests/add.py | 53 ++++-- operatorspy/tests/div.py | 208 ++++++++++++++++++++++ operatorspy/tests/max.py | 190 +++++++++++++++++++++ operatorspy/tests/min.py | 189 ++++++++++++++++++++ operatorspy/tests/mod.py | 208 ++++++++++++++++++++++ operatorspy/tests/mul.py | 189 ++++++++++++++++++++ operatorspy/tests/pow.py | 187 ++++++++++++++++++++ operatorspy/tests/sub.py | 189 ++++++++++++++++++++ src/devices/cpu/common_cpu.cc | 25 ++- src/ops/add/cpu/add_cpu.cc | 101 ----------- src/ops/add/cpu/add_cpu.h | 33 ---- src/ops/add/cuda/add.cc | 81 --------- src/ops/add/cuda/add.cu | 116 ------------- src/ops/add/cuda/add.cuh | 37 ---- src/ops/add/operator.cc | 95 ++++------- src/ops/binary/binary.h | 72 ++++++++ src/ops/binary/cpu/binary_cpu.cc | 176 +++++++++++++++++++ src/ops/binary/cpu/binary_cpu.h | 37 ++++ src/ops/binary/cuda/binary.cc | 82 +++++++++ src/ops/binary/cuda/binary.cu | 285 +++++++++++++++++++++++++++++++ src/ops/binary/cuda/binary.cuh | 38 +++++ src/ops/binary/operator.cc | 74 ++++++++ src/ops/div/operator.cc | 44 +++++ src/ops/max/operator.cc | 44 +++++ src/ops/min/operator.cc | 44 +++++ src/ops/mod/operator.cc | 44 +++++ src/ops/mul/operator.cc | 44 +++++ src/ops/pow/operator.cc | 44 +++++ src/ops/sub/operator.cc | 44 +++++ 36 files changed, 2705 insertions(+), 457 deletions(-) create mode 100644 include/ops/div/div.h create mode 100644 include/ops/max/max.h create mode 100644 include/ops/min/min.h create mode 100644 include/ops/mod/mod.h create mode 100644 include/ops/mul/mul.h create mode 100644 include/ops/pow/pow.h create mode 100644 include/ops/sub/sub.h create mode 100644 operatorspy/tests/div.py create mode 100644 operatorspy/tests/max.py create mode 100644 operatorspy/tests/min.py create mode 100644 operatorspy/tests/mod.py create mode 100644 operatorspy/tests/mul.py create mode 100644 operatorspy/tests/pow.py create mode 100644 operatorspy/tests/sub.py delete mode 100644 src/ops/add/cpu/add_cpu.cc delete mode 100644 src/ops/add/cpu/add_cpu.h delete mode 100644 src/ops/add/cuda/add.cc delete mode 100644 src/ops/add/cuda/add.cu delete mode 100644 src/ops/add/cuda/add.cuh create mode 100644 src/ops/binary/binary.h create mode 100644 src/ops/binary/cpu/binary_cpu.cc create mode 100644 src/ops/binary/cpu/binary_cpu.h create mode 100644 src/ops/binary/cuda/binary.cc create mode 100644 src/ops/binary/cuda/binary.cu create mode 100644 src/ops/binary/cuda/binary.cuh create mode 100644 src/ops/binary/operator.cc create mode 100644 src/ops/div/operator.cc create mode 100644 src/ops/max/operator.cc create mode 100644 src/ops/min/operator.cc create mode 100644 src/ops/mod/operator.cc create mode 100644 src/ops/mul/operator.cc create mode 100644 src/ops/pow/operator.cc create mode 100644 src/ops/sub/operator.cc diff --git a/include/ops/div/div.h b/include/ops/div/div.h new file mode 100644 index 00000000..a749272d --- /dev/null +++ b/include/ops/div/div.h @@ -0,0 +1,27 @@ +#ifndef DIV_H +#define DIV_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct DivDescriptor { + Device device; +} DivDescriptor; + 
+typedef DivDescriptor *infiniopDivDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateDivDescriptor(infiniopHandle_t handle, + infiniopDivDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniopStatus_t infiniopDiv(infiniopDivDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc); + +#endif diff --git a/include/ops/max/max.h b/include/ops/max/max.h new file mode 100644 index 00000000..446dcb2e --- /dev/null +++ b/include/ops/max/max.h @@ -0,0 +1,27 @@ +#ifndef MAX_H +#define MAX_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct MaxDescriptor { + Device device; +} MaxDescriptor; + +typedef MaxDescriptor *infiniopMaxDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateMaxDescriptor(infiniopHandle_t handle, + infiniopMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniopStatus_t infiniopMax(infiniopMaxDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyMaxDescriptor(infiniopMaxDescriptor_t desc); + +#endif diff --git a/include/ops/min/min.h b/include/ops/min/min.h new file mode 100644 index 00000000..28767f93 --- /dev/null +++ b/include/ops/min/min.h @@ -0,0 +1,27 @@ +#ifndef MIN_H +#define MIN_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct MinDescriptor { + Device device; +} MinDescriptor; + +typedef MinDescriptor *infiniopMinDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateMinDescriptor(infiniopHandle_t handle, + infiniopMinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniopStatus_t infiniopMin(infiniopMinDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyMinDescriptor(infiniopMinDescriptor_t desc); + +#endif diff --git a/include/ops/mod/mod.h b/include/ops/mod/mod.h new file mode 100644 index 00000000..0065c885 --- /dev/null +++ b/include/ops/mod/mod.h @@ -0,0 +1,27 @@ +#ifndef MOD_H +#define MOD_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ModDescriptor { + Device device; +} ModDescriptor; + +typedef ModDescriptor *infiniopModDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateModDescriptor(infiniopHandle_t handle, + infiniopModDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniopStatus_t infiniopMod(infiniopModDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyModDescriptor(infiniopModDescriptor_t desc); + +#endif diff --git a/include/ops/mul/mul.h b/include/ops/mul/mul.h new file mode 100644 index 00000000..4a30afe4 --- /dev/null +++ b/include/ops/mul/mul.h @@ -0,0 +1,27 @@ +#ifndef MUL_H +#define MUL_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct MulDescriptor { + Device device; +} MulDescriptor; + +typedef MulDescriptor *infiniopMulDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateMulDescriptor(infiniopHandle_t handle, + infiniopMulDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + 
infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniopStatus_t infiniopMul(infiniopMulDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc); + +#endif diff --git a/include/ops/pow/pow.h b/include/ops/pow/pow.h new file mode 100644 index 00000000..3ba6eb7c --- /dev/null +++ b/include/ops/pow/pow.h @@ -0,0 +1,27 @@ +#ifndef POW_H +#define POW_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct PowDescriptor { + Device device; +} PowDescriptor; + +typedef PowDescriptor *infiniopPowDescriptor_t; + +__C __export infiniopStatus_t infiniopCreatePowDescriptor(infiniopHandle_t handle, + infiniopPowDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniopStatus_t infiniopPow(infiniopPowDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyPowDescriptor(infiniopPowDescriptor_t desc); + +#endif diff --git a/include/ops/sub/sub.h b/include/ops/sub/sub.h new file mode 100644 index 00000000..f5577720 --- /dev/null +++ b/include/ops/sub/sub.h @@ -0,0 +1,27 @@ +#ifndef SUB_H +#define SUB_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct SubDescriptor { + Device device; +} SubDescriptor; + +typedef SubDescriptor *infiniopSubDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateSubDescriptor(infiniopHandle_t handle, + infiniopSubDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniopStatus_t infiniopSub(infiniopSubDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t infiniopDestroySubDescriptor(infiniopSubDescriptor_t desc); + +#endif diff --git a/operatorspy/tests/add.py b/operatorspy/tests/add.py index d766208c..b4d2203e 100644 --- a/operatorspy/tests/add.py +++ b/operatorspy/tests/add.py @@ -2,6 +2,7 @@ import ctypes import sys import os +import time sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) from operatorspy import ( @@ -19,11 +20,18 @@ from enum import Enum, auto import torch +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function (elementwise.cu), +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 class Inplace(Enum): OUT_OF_PLACE = auto() INPLACE_A = auto() INPLACE_B = auto() + INPLACE_AB = auto() class AddDescriptor(Structure): @@ -34,6 +42,10 @@ class AddDescriptor(Structure): def add(x, y): + if PROFILE: + ans = torch.add(x, y) + torch.cuda.synchronize() + return ans return torch.add(x, y) @@ -55,13 +67,20 @@ def test( return a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) - b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) + b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) if inplace != Inplace.INPLACE_AB else a c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b) - ans = add(a, b) + for i in range(NUM_PRERUN if PROFILE else 1): + ans = add(a, b) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = add(a, b) + elapsed = (time.time() - start_time) 
/ NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") a_tensor = to_tensor(a, lib) - b_tensor = to_tensor(b, lib) + b_tensor = to_tensor(b, lib) if inplace != Inplace.INPLACE_AB else a_tensor c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) descriptor = infiniopAddDescriptor_t() @@ -74,9 +93,20 @@ def test( b_tensor.descriptor, ) ) - lib.infiniopAdd( - descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None - ) + + for i in range(NUM_PRERUN if PROFILE else 1): + lib.infiniopAdd( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + lib.infiniopAdd( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + assert torch.allclose(c, ans, atol=0, rtol=1e-3) check_error(lib.infiniopDestroyAddDescriptor(descriptor)) @@ -113,20 +143,17 @@ def test_bang(lib, test_cases): if __name__ == "__main__": test_cases = [ # c_shape, a_shape, b_shape, inplace - # ((32, 150, 512000), (32, 150, 512000), (32, 150, 512000), Inplace.OUT_OF_PLACE), # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE), ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), ((), (), (), Inplace.OUT_OF_PLACE), - ((3, 3), (3, 3), (3, 3), Inplace.OUT_OF_PLACE), - ((2, 20, 3), (2, 1, 3), (2, 20, 3), Inplace.OUT_OF_PLACE), - ((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_A), - ((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_B), - ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), - ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), ] args = get_args() lib = open_lib() diff --git a/operatorspy/tests/div.py b/operatorspy/tests/div.py new file mode 100644 index 00000000..c303c635 --- /dev/null +++ b/operatorspy/tests/div.py @@ -0,0 +1,208 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function (elementwise.cu), 
+# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_AB = auto() + + +class DivDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopDivDescriptor_t = POINTER(DivDescriptor) + +def find_and_print_differing_indices( + a, b, c, ans, atol=0, rtol=1e-2 +): + if c.shape != ans.shape: + raise ValueError("Tensors must have the same shape to compare.") + + # Calculate the difference mask based on atol and rtol + diff_mask = torch.abs(c - ans) > (atol + rtol * torch.abs(ans)) + diff_indices = torch.nonzero(diff_mask, as_tuple=False) + + # Print the indices and the differing elements + for idx in diff_indices: + index_tuple = tuple(idx.tolist()) + print( + f"Index: {index_tuple}, a: {a[index_tuple]}, b: {b[index_tuple]}, y element: {c[index_tuple]}, ans element: {ans[index_tuple]}" + ) + + return diff_indices + + +def div(x, y): + if PROFILE: + ans = torch.div(x, y) + torch.cuda.synchronize() + return ans + return torch.div(x, y) + + +def test( + lib, + handle, + torch_device, + c_shape, + a_shape, + b_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Div on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE: + print("Unsupported test: broadcasting does not support in-place") + return + + a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) * 10 + b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) * 2 if inplace != Inplace.INPLACE_AB else a + c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = div(a, b) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = div(a, b) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) if inplace != Inplace.INPLACE_AB else a_tensor + c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) + descriptor = infiniopDivDescriptor_t() + + check_error( + lib.infiniopCreateDivDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + for i in range(NUM_PRERUN if PROFILE else 1): + lib.infiniopDiv( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + lib.infiniopDiv( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(c, ans, atol=0, rtol=1e-3, equal_nan=True) + check_error(lib.infiniopDestroyDivDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = 
create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # c_shape, a_shape, b_shape, inplace + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE), + ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), + ((), (), (), Inplace.OUT_OF_PLACE), + ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateDivDescriptor.restype = c_int32 + lib.infiniopCreateDivDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopDivDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopDiv.restype = c_int32 + lib.infiniopDiv.argtypes = [ + infiniopDivDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyDivDescriptor.restype = c_int32 + lib.infiniopDestroyDivDescriptor.argtypes = [ + infiniopDivDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/max.py b/operatorspy/tests/max.py new file mode 100644 index 00000000..39eaf2b3 --- /dev/null +++ b/operatorspy/tests/max.py @@ -0,0 +1,190 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function (elementwise.cu), +# e.g., cudaDeviceSynchronize() 
for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_AB = auto() + + +class MaxDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopMaxDescriptor_t = POINTER(MaxDescriptor) + + +def max(x, y): + if PROFILE: + ans = torch.max(x, y) + torch.cuda.synchronize() + return ans + return torch.max(x, y) + + +def test( + lib, + handle, + torch_device, + c_shape, + a_shape, + b_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Max on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE: + print("Unsupported test: broadcasting does not support in-place") + return + + a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) * 10 + b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) if inplace != Inplace.INPLACE_AB else a + c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = max(a, b) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = max(a, b) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) if inplace != Inplace.INPLACE_AB else a_tensor + c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) + descriptor = infiniopMaxDescriptor_t() + + check_error( + lib.infiniopCreateMaxDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + for i in range(NUM_PRERUN if PROFILE else 1): + lib.infiniopMax( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + lib.infiniopMax( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(c, ans, atol=0, rtol=0) + check_error(lib.infiniopDestroyMaxDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, 
inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # c_shape, a_shape, b_shape, inplace + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE), + ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), + ((), (), (), Inplace.OUT_OF_PLACE), + ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((33, 333, 333), (33, 333, 333), (33, 333, 333), Inplace.OUT_OF_PLACE), + ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateMaxDescriptor.restype = c_int32 + lib.infiniopCreateMaxDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopMaxDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopMax.restype = c_int32 + lib.infiniopMax.argtypes = [ + infiniopMaxDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyMaxDescriptor.restype = c_int32 + lib.infiniopDestroyMaxDescriptor.argtypes = [ + infiniopMaxDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/min.py b/operatorspy/tests/min.py new file mode 100644 index 00000000..050ea64a --- /dev/null +++ b/operatorspy/tests/min.py @@ -0,0 +1,189 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function (elementwise.cu), +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_AB = auto() + + +class MinDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopMinDescriptor_t = POINTER(MinDescriptor) + + +def min(x, y): + if PROFILE: + ans = torch.min(x, y) + torch.cuda.synchronize() + return ans + return torch.min(x, y) + + +def test( + lib, + handle, + torch_device, + c_shape, + a_shape, + b_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Min on 
{torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE: + print("Unsupported test: broadcasting does not support in-place") + return + + a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) * 10 + b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) if inplace != Inplace.INPLACE_AB else a + c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = min(a, b) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = min(a, b) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) if inplace != Inplace.INPLACE_AB else a_tensor + c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) + descriptor = infiniopMinDescriptor_t() + + check_error( + lib.infiniopCreateMinDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + for i in range(NUM_PRERUN if PROFILE else 1): + lib.infiniopMin( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + lib.infiniopMin( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(c, ans, atol=0, rtol=0) + check_error(lib.infiniopDestroyMinDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # c_shape, a_shape, b_shape, inplace + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE), + ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), + ((), (), (), Inplace.OUT_OF_PLACE), + ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), 
Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateMinDescriptor.restype = c_int32 + lib.infiniopCreateMinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopMinDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopMin.restype = c_int32 + lib.infiniopMin.argtypes = [ + infiniopMinDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyMinDescriptor.restype = c_int32 + lib.infiniopDestroyMinDescriptor.argtypes = [ + infiniopMinDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/mod.py b/operatorspy/tests/mod.py new file mode 100644 index 00000000..342e3662 --- /dev/null +++ b/operatorspy/tests/mod.py @@ -0,0 +1,208 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function (elementwise.cu), +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_AB = auto() + + +class ModDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopModDescriptor_t = POINTER(ModDescriptor) + +def find_and_print_differing_indices( + a, b, c, ans, atol=0, rtol=1e-2 +): + if c.shape != ans.shape: + raise ValueError("Tensors must have the same shape to compare.") + + # Calculate the difference mask based on atol and rtol + diff_mask = torch.abs(c - ans) > (atol + rtol * torch.abs(ans)) + diff_indices = torch.nonzero(diff_mask, as_tuple=False) + + # Print the indices and the differing elements + for idx in diff_indices: + index_tuple = tuple(idx.tolist()) + print( + f"Index: {index_tuple}, a: {a[index_tuple]}, b: {b[index_tuple]}, y element: {c[index_tuple]}, ans element: {ans[index_tuple]}" + ) + + return diff_indices + + +def mod(x, y): + if PROFILE: + ans = torch.fmod(x, y) + torch.cuda.synchronize() + return ans + return torch.fmod(x, y) + + +def test( + lib, + handle, + torch_device, + c_shape, + a_shape, + b_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Mod on {torch_device} with c_shape:{c_shape} 
a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE: + print("Unsupported test: broadcasting does not support in-place") + return + + a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) * 10 + b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) if inplace != Inplace.INPLACE_AB else a + c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = mod(a, b) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = mod(a, b) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) if inplace != Inplace.INPLACE_AB else a_tensor + c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) + descriptor = infiniopModDescriptor_t() + + check_error( + lib.infiniopCreateModDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + for i in range(NUM_PRERUN if PROFILE else 1): + lib.infiniopMod( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + lib.infiniopMod( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(c, ans, atol=0, rtol=0, equal_nan=True) + check_error(lib.infiniopDestroyModDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # c_shape, a_shape, b_shape, inplace + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), + ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), + ((), (), (), Inplace.OUT_OF_PLACE), + ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 
100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateModDescriptor.restype = c_int32 + lib.infiniopCreateModDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopModDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopMod.restype = c_int32 + lib.infiniopMod.argtypes = [ + infiniopModDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyModDescriptor.restype = c_int32 + lib.infiniopDestroyModDescriptor.argtypes = [ + infiniopModDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/mul.py b/operatorspy/tests/mul.py new file mode 100644 index 00000000..57085b42 --- /dev/null +++ b/operatorspy/tests/mul.py @@ -0,0 +1,189 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function (elementwise.cu), +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_AB = auto() + + +class MulDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopMulDescriptor_t = POINTER(MulDescriptor) + + +def mul(x, y): + if PROFILE: + ans = torch.mul(x, y) + torch.cuda.synchronize() + return ans + return torch.mul(x, y) + + +def test( + lib, + handle, + torch_device, + c_shape, + a_shape, + b_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Mul on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE: + print("Unsupported test: broadcasting does not support in-place") + return + + a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) * 10 + b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) if inplace != Inplace.INPLACE_AB else a + c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = mul(a, b) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = 
mul(a, b) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) if inplace != Inplace.INPLACE_AB else a_tensor + c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) + descriptor = infiniopMulDescriptor_t() + + check_error( + lib.infiniopCreateMulDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + for i in range(NUM_PRERUN if PROFILE else 1): + lib.infiniopMul( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + lib.infiniopMul( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(c, ans, atol=1e-6, rtol=1e-3) + check_error(lib.infiniopDestroyMulDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # c_shape, a_shape, b_shape, inplace + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE), + ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), + ((), (), (), Inplace.OUT_OF_PLACE), + ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateMulDescriptor.restype = c_int32 + 
lib.infiniopCreateMulDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopMulDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopMul.restype = c_int32 + lib.infiniopMul.argtypes = [ + infiniopMulDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyMulDescriptor.restype = c_int32 + lib.infiniopDestroyMulDescriptor.argtypes = [ + infiniopMulDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/pow.py b/operatorspy/tests/pow.py new file mode 100644 index 00000000..e5106b36 --- /dev/null +++ b/operatorspy/tests/pow.py @@ -0,0 +1,187 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function (elementwise.cu), +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_AB = auto() + + +class PowDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopPowDescriptor_t = POINTER(PowDescriptor) + + +def pow(x, y): + if PROFILE: + ans = torch.pow(x, y) + torch.cuda.synchronize() + return ans + return torch.pow(x, y) + + +def test( + lib, + handle, + torch_device, + c_shape, + a_shape, + b_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Pow on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + + a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) * 5 + b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) * 5 if inplace != Inplace.INPLACE_AB else a + c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = pow(a, b) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = pow(a, b) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) if inplace != Inplace.INPLACE_AB else a_tensor + c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) + descriptor = infiniopPowDescriptor_t() + + check_error( + lib.infiniopCreatePowDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + for i in range(NUM_PRERUN if PROFILE else 1): + lib.infiniopPow( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + lib.infiniopPow( + descriptor, 
c_tensor.data, a_tensor.data, b_tensor.data, None + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(c, ans, atol=1e-7, rtol=1e-3, equal_nan=True) + check_error(lib.infiniopDestroyPowDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # c_shape, a_shape, b_shape, inplace + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE), + ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), + ((), (), (), Inplace.OUT_OF_PLACE), + ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((33, 333, 333), (33, 333, 333), (33, 333, 333), Inplace.OUT_OF_PLACE), + ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreatePowDescriptor.restype = c_int32 + lib.infiniopCreatePowDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopPowDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopPow.restype = c_int32 + lib.infiniopPow.argtypes = [ + infiniopPowDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyPowDescriptor.restype = c_int32 + lib.infiniopDestroyPowDescriptor.argtypes = [ + infiniopPowDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git 
a/operatorspy/tests/sub.py b/operatorspy/tests/sub.py new file mode 100644 index 00000000..cc8eb406 --- /dev/null +++ b/operatorspy/tests/sub.py @@ -0,0 +1,189 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function (elementwise.cu), +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_AB = auto() + + +class SubDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopSubDescriptor_t = POINTER(SubDescriptor) + + +def subtract(x, y): + if PROFILE: + ans = torch.subtract(x, y) + torch.cuda.synchronize() + return ans + return torch.subtract(x, y) + + +def test( + lib, + handle, + torch_device, + c_shape, + a_shape, + b_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Sub on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE: + print("Unsupported test: broadcasting does not support in-place") + return + + a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) + b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) if inplace != Inplace.INPLACE_AB else a + c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = subtract(a, b) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = subtract(a, b) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) if inplace != Inplace.INPLACE_AB else a_tensor + c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) + descriptor = infiniopSubDescriptor_t() + + check_error( + lib.infiniopCreateSubDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + for i in range(NUM_PRERUN if PROFILE else 1): + lib.infiniopSub( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + lib.infiniopSub( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(c, ans, atol=0, rtol=1e-3) + check_error(lib.infiniopDestroySubDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", c_shape, a_shape, b_shape, 
tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # c_shape, a_shape, b_shape, inplace + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE), + ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), + ((), (), (), Inplace.OUT_OF_PLACE), + ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateSubDescriptor.restype = c_int32 + lib.infiniopCreateSubDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopSubDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopSub.restype = c_int32 + lib.infiniopSub.argtypes = [ + infiniopSubDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroySubDescriptor.restype = c_int32 + lib.infiniopDestroySubDescriptor.argtypes = [ + infiniopSubDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/src/devices/cpu/common_cpu.cc b/src/devices/cpu/common_cpu.cc index b5b5f0fd..3e097446 100644 --- a/src/devices/cpu/common_cpu.cc +++ b/src/devices/cpu/common_cpu.cc @@ -44,22 +44,19 @@ uint16_t f32_to_f16(float val) { int32_t exponent = ((f32 >> 23) & 0xFF) - 127;// Extract and de-bias the exponent uint32_t mantissa = f32 & 0x7FFFFF; // Extract the mantissa (fraction part) - if (exponent == 128) {// Special case for Inf and NaN - if (mantissa != 0) { - // NaN - return sign | 0x7C00 | (mantissa >> 13);// Convert the NaN payload - } else { - // Infinity - return sign | 0x7C00; + if (exponent >= 31) {// Special cases for Inf and NaN + // NaN + if (exponent == 
128 && mantissa != 0) { + return sign | 0x7E00; } - } else if (exponent > 15) { // Overflow: Larger than float16 max - return sign | 0x7C00; // Return infinity - } else if (exponent >= -14) {// Normalized float16 + // Infinity + return sign | 0x7C00; + } else if (exponent >= -14) {// Normalized case return sign | ((exponent + 15) << 10) | (mantissa >> 13); - } else if (exponent >= -24) { // Subnormal float16 (leading denormals) - mantissa |= 0x800000; // Add implicit leading 1 - int32_t shift = -exponent - 1;// Calculate shift for subnormal numbers - return sign | (mantissa >> (13 + shift)); + } else if (exponent >= -24) { + mantissa |= 0x800000;// Add implicit leading 1 + mantissa >>= (-14 - exponent); + return sign | (mantissa >> 13); } else { // Too small for subnormal: return signed zero return sign; diff --git a/src/ops/add/cpu/add_cpu.cc b/src/ops/add/cpu/add_cpu.cc deleted file mode 100644 index 649fa052..00000000 --- a/src/ops/add/cpu/add_cpu.cc +++ /dev/null @@ -1,101 +0,0 @@ -#include "add_cpu.h" -#include "../../../devices/cpu/common_cpu.h" -#include "../../utils.h" - -inline void incrementOne(uint64_t *indices, uint64_t const *shape, uint64_t ndim) { - for (int64_t i = ndim - 1; i >= 0; --i) { - if (++indices[i] != shape[i]) { - return; - } - indices[i] = 0; - } -} - -inline uint64_t compactToFlat(uint64_t const *indices, uint64_t const *strides, uint64_t ndim) { - return std::inner_product(indices, indices + ndim, strides, uint64_t(0)); -} - -infiniopStatus_t cpuCreateAddDescriptor(infiniopHandle_t, - AddCpuDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b) { - uint64_t ndim = c->ndim; - if (!isValidBroadcastShape(a, b, c)) { - return STATUS_BAD_TENSOR_SHAPE; - } - if (!is_contiguous(a) || !is_contiguous(b) || !is_contiguous(c)) { - return STATUS_BAD_TENSOR_STRIDES; - } - if (c->dt != F16 && c->dt != F32) { - return STATUS_BAD_TENSOR_DTYPE; - } - if (c->dt != a->dt || c->dt != b->dt) { - return STATUS_BAD_TENSOR_DTYPE; - } - - uint64_t c_data_size = std::accumulate(c->shape, c->shape + c->ndim, 1ULL, std::multiplies()); - - // get the adjusted strides for a and b - uint64_t *a_strides = new uint64_t[ndim]; - uint64_t *b_strides = new uint64_t[ndim]; - for (size_t i = 0; i < ndim; ++i) { - a_strides[i] = (i < ndim - a->ndim || c->shape[i] != a->shape[i + a->ndim - ndim]) ? 0 : a->strides[i + a->ndim - ndim]; - b_strides[i] = (i < ndim - b->ndim || c->shape[i] != b->shape[i + b->ndim - ndim]) ? 
0 : b->strides[i + b->ndim - ndim]; - } - - uint64_t *c_indices = new uint64_t[ndim]; - std::fill(c_indices, c_indices + ndim, 0); - - *desc_ptr = new AddCpuDescriptor{ - DevCpu, - c->dt, - ndim, - c_data_size, - c->shape, - a_strides, - b_strides, - c_indices, - }; - - return STATUS_SUCCESS; -} - -infiniopStatus_t cpuDestroyAddDescriptor(AddCpuDescriptor_t desc) { - delete[] desc->a_strides; - delete[] desc->b_strides; - delete[] desc->c_indices; - delete desc; - return STATUS_SUCCESS; -} - -template -infiniopStatus_t add_cpu(AddCpuDescriptor_t desc, void *c, void const *a, void const *b) { - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto c_ = reinterpret_cast(c); - const auto &indices = desc->c_indices; - - for (uint64_t i = 0; i < desc->c_data_size; ++i, incrementOne(indices, desc->c_shape, desc->ndim)) { - auto a_index = compactToFlat(indices, desc->a_strides, desc->ndim); - auto b_index = compactToFlat(indices, desc->b_strides, desc->ndim); - if constexpr (std::is_same::value) { - c_[i] = f32_to_f16(f16_to_f32(a_[a_index]) + f16_to_f32(b_[b_index])); - } else { - c_[i] = a_[a_index] + b_[b_index]; - } - } - return STATUS_SUCCESS; -} - -infiniopStatus_t cpuAdd(AddCpuDescriptor_t desc, - void *c, void const *a, void const *b, - void *stream) { - if (desc->dtype == F16) { - return add_cpu(desc, c, a, b); - } - if (desc->dtype == F32) { - return add_cpu(desc, c, a, b); - } - return STATUS_BAD_TENSOR_DTYPE; -} diff --git a/src/ops/add/cpu/add_cpu.h b/src/ops/add/cpu/add_cpu.h deleted file mode 100644 index 42e62435..00000000 --- a/src/ops/add/cpu/add_cpu.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef __CPU_ADD_H__ -#define __CPU_ADD_H__ - -#include "operators.h" -#include -#include - -struct AddCpuDescriptor { - Device device; - DT dtype; - uint64_t ndim; - uint64_t c_data_size; - uint64_t const *c_shape; - uint64_t const *a_strides; - uint64_t const *b_strides; - uint64_t *c_indices; -}; - -typedef struct AddCpuDescriptor *AddCpuDescriptor_t; - -infiniopStatus_t cpuCreateAddDescriptor(infiniopHandle_t, - AddCpuDescriptor_t *, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -infiniopStatus_t cpuAdd(AddCpuDescriptor_t desc, - void *c, void const *a, void const *b, - void *stream); - -infiniopStatus_t cpuDestroyAddDescriptor(AddCpuDescriptor_t desc); - -#endif diff --git a/src/ops/add/cuda/add.cc b/src/ops/add/cuda/add.cc deleted file mode 100644 index b010894f..00000000 --- a/src/ops/add/cuda/add.cc +++ /dev/null @@ -1,81 +0,0 @@ -#include "add.cuh" -#include "../../../devices/cuda/common_cuda.h" -#include "../../utils.h" - -infiniopStatus_t cudaCreateAddDescriptor(CudaHandle_t handle, - AddCudaDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b) { - uint64_t ndim = c->ndim; - if (!isValidBroadcastShape(a, b, c)) { - return STATUS_BAD_TENSOR_SHAPE; - } - if (!is_contiguous(a) || !is_contiguous(b) || !is_contiguous(c)) { - return STATUS_BAD_TENSOR_STRIDES; - } - if (c->dt != F16 && c->dt != F32) { - return STATUS_BAD_TENSOR_DTYPE; - } - if (c->dt != a->dt || c->dt != b->dt) { - return STATUS_BAD_TENSOR_DTYPE; - } - bool broadcasted = false; - if (ndim != a->ndim || ndim != b->ndim) { - broadcasted = true; - } else { - for (uint64_t i = 0; i < ndim; ++i) { - if (c->shape[i] != a->shape[i] || c->shape[i] != b->shape[i]) { - broadcasted = true; - break; - } - } - } - - uint64_t c_data_size = std::accumulate(c->shape, c->shape + c->ndim, 1ULL, 
std::multiplies()); - - // get the adjusted strides for a and b - int64_t *a_strides = new int64_t[ndim]; - int64_t *b_strides = new int64_t[ndim]; - for (size_t i = 0; i < ndim; ++i) { - a_strides[i] = (i < ndim - a->ndim || c->shape[i] != a->shape[i + a->ndim - ndim]) ? 0 : a->strides[i + a->ndim - ndim]; - b_strides[i] = (i < ndim - b->ndim || c->shape[i] != b->shape[i + b->ndim - ndim]) ? 0 : b->strides[i + b->ndim - ndim]; - } - - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, handle->device_id); - - int64_t *a_strides_d, *b_strides_d, *c_strides_d; - checkCudaErrorWithCode(cudaMalloc(&a_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); - checkCudaErrorWithCode(cudaMalloc(&b_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); - checkCudaErrorWithCode(cudaMalloc(&c_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); - checkCudaErrorWithCode(cudaMemcpy(a_strides_d, a_strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); - checkCudaErrorWithCode(cudaMemcpy(b_strides_d, b_strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); - checkCudaErrorWithCode(cudaMemcpy(c_strides_d, c->strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); - - *desc_ptr = new AddCudaDescriptor{ - DevNvGpu, - c->dt, - handle->device_id, - ndim, - c_data_size, - static_cast(prop.maxGridSize[0]), - a_strides_d, - b_strides_d, - c_strides_d, - broadcasted, - }; - - delete[] a_strides; - delete[] b_strides; - - return STATUS_SUCCESS; -} - -infiniopStatus_t cudaDestroyAddDescriptor(AddCudaDescriptor_t desc) { - checkCudaErrorWithCode(cudaFree((void *) desc->a_strides), STATUS_EXECUTION_FAILED); - checkCudaErrorWithCode(cudaFree((void *) desc->b_strides), STATUS_EXECUTION_FAILED); - checkCudaErrorWithCode(cudaFree((void *) desc->c_strides), STATUS_EXECUTION_FAILED); - delete desc; - return STATUS_SUCCESS; -} diff --git a/src/ops/add/cuda/add.cu b/src/ops/add/cuda/add.cu deleted file mode 100644 index 9d9aefcb..00000000 --- a/src/ops/add/cuda/add.cu +++ /dev/null @@ -1,116 +0,0 @@ -#include "../../../devices/cuda/common_cuda.h" -#include "../../utils.h" -#include "add.cuh" - -/** - * @brief A templated vector struct that supports element-wise addition on arrays. - * - * @tparam T - The access data type for elements in the vector. - * @tparam TComp - The computation data type used for arithmetic operations. - * @tparam N - The number of elements of type T in the vector for a single access. 
- */ -template -struct vecN { - T data[N]; - - __device__ __forceinline__ vecN operator+(const vecN &other) const { - vecN result; - - for (int i = 0; i < N; ++i) { - if constexpr (std::is_same::value) { - result.data[i] = data[i] + other.data[i]; - } else { - constexpr static size_t pack_size = sizeof(T) / sizeof(TComp); - auto data_ = reinterpret_cast *>(result.data); - data_[i] = std::move(reinterpret_cast const *>(data)[i] + - reinterpret_cast const *>(other.data)[i]); - } - } - - return result; - } - - __device__ __forceinline__ const T &operator[](size_t i) const { - return data[i]; - } -}; - -template -__global__ void add( - Tdata *c, - const Tdata *a, - const Tdata *b, - const int64_t *a_strides, - const int64_t *b_strides, - const int64_t *c_strides, - uint64_t data_size, - uint64_t ndim, - uint64_t offset, - bool broadcasted, - unsigned pack_size) { - uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; - - if (idx < data_size) { - if (broadcasted) { - idx *= pack_size; - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto c_ = reinterpret_cast(c); -#pragma unroll - for (size_t i = 0; i < pack_size; ++i) { - auto a_idx = getDstOffset(idx + i, ndim, c_strides, a_strides); - auto b_idx = getDstOffset(idx + i, ndim, c_strides, b_strides); - c_[idx + i] = a_[a_idx] + b_[b_idx]; - } - return; - } - c[idx] = a[idx] + b[idx]; - } -} - -template -void _add_nv_gpu(AddCudaDescriptor_t desc, Tdata *c, Tdata const *a, Tdata const *b, uint64_t data_size, uint64_t pack_size, uint64_t offset, void *stream) { - if (data_size == 0) { - return; - } - dim3 blockDims = dim3(std::min(static_cast(256), data_size)); - dim3 gridDims = dim3(std::min(ROUND_UP_DIV(data_size, blockDims.x), desc->max_grid_size)); - uint64_t step = gridDims.x * blockDims.x; - - cudaStream_t cuda_stream = reinterpret_cast(stream); - -#pragma unroll - for (uint64_t i = 0; i < data_size; i += step) { - add<<>>( - c, a, b, desc->a_strides, desc->b_strides, desc->c_strides, offset + data_size, desc->ndim, offset + i, desc->broadcasted, pack_size); - } -} - -template -infiniopStatus_t add_nv_gpu(AddCudaDescriptor_t desc, void *c, void const *a, void const *b, void *stream, uint64_t pack_size) { - const auto data_size = desc->c_data_size / pack_size; - const auto a_vec = reinterpret_cast(a); - const auto b_vec = reinterpret_cast(b); - const auto c_vec = reinterpret_cast(c); - _add_nv_gpu(desc, c_vec, a_vec, b_vec, data_size, pack_size, 0, stream); - - const auto remainder = desc->c_data_size % pack_size; - const auto a_ = reinterpret_cast(a); - const auto b_ = reinterpret_cast(b); - const auto c_ = reinterpret_cast(c); - _add_nv_gpu(desc, c_, a_, b_, remainder, 1, data_size * pack_size, stream); - return STATUS_SUCCESS; -} - -infiniopStatus_t cudaAdd(AddCudaDescriptor_t desc, - void *c, void const *a, void const *b, - void *stream) { - checkCudaError(cudaSetDevice(desc->device_id)); - if (desc->dtype == F16) { - return add_nv_gpu, half>(desc, c, a, b, stream, 8); - } - if (desc->dtype == F32) { - return add_nv_gpu, float>(desc, c, a, b, stream, 4); - } - return STATUS_BAD_TENSOR_DTYPE; -} diff --git a/src/ops/add/cuda/add.cuh b/src/ops/add/cuda/add.cuh deleted file mode 100644 index 03a181eb..00000000 --- a/src/ops/add/cuda/add.cuh +++ /dev/null @@ -1,37 +0,0 @@ -#ifndef __CUDA_ADD_H__ -#define __CUDA_ADD_H__ - -#include "../../../devices/cuda/common_cuda.h" -#include "../../../devices/cuda/cuda_handle.h" -#include "operators.h" -#include -#include - -struct AddCudaDescriptor { - Device device; - DT 
dtype; - int device_id; - uint64_t ndim; - uint64_t c_data_size; - uint64_t max_grid_size; - int64_t const *a_strides; - int64_t const *b_strides; - int64_t const *c_strides; - bool broadcasted; -}; - -typedef struct AddCudaDescriptor *AddCudaDescriptor_t; - -infiniopStatus_t cudaCreateAddDescriptor(CudaHandle_t, - AddCudaDescriptor_t *, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -infiniopStatus_t cudaAdd(AddCudaDescriptor_t desc, - void *c, void const *a, void const *b, - void *stream); - -infiniopStatus_t cudaDestroyAddDescriptor(AddCudaDescriptor_t desc); - -#endif diff --git a/src/ops/add/operator.cc b/src/ops/add/operator.cc index c2a30ea8..b2c51f70 100644 --- a/src/ops/add/operator.cc +++ b/src/ops/add/operator.cc @@ -1,73 +1,44 @@ +#include "../binary/binary.h" #include "../utils.h" -#include "operators.h" #include "ops/add/add.h" +#include "tensor/tensor_descriptor.h" -#ifdef ENABLE_CPU -#include "cpu/add_cpu.h" -#endif -#ifdef ENABLE_NV_GPU -#include "../../devices/cuda/cuda_handle.h" -#include "cuda/add.cuh" -#endif +struct _AddDescriptor { + Device device; + infiniopBinaryDescriptor_t binary_desc; +}; -__C infiniopStatus_t infiniopCreateAddDescriptor( - infiniopHandle_t handle, - infiniopAddDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b) { - switch (handle->device) { -#ifdef ENABLE_CPU - case DevCpu: - return cpuCreateAddDescriptor(handle, (AddCpuDescriptor_t *) desc_ptr, c, a, b); -#endif -#ifdef ENABLE_NV_GPU - case DevNvGpu: { - return cudaCreateAddDescriptor((CudaHandle_t) handle, (AddCudaDescriptor_t *) desc_ptr, c, a, b); - } +typedef struct _AddDescriptor *_AddDescriptor_t; -#endif -#ifdef ENABLE_CAMBRICON_MLU - // TODO -#endif - } - return STATUS_BAD_DEVICE; -} +__C __export infiniopStatus_t infiniopCreateAddDescriptor(infiniopHandle_t handle, + infiniopAddDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + // binary desc + infiniopBinaryDescriptor_t binary_desc; + CHECK_STATUS(infiniopCreateBinaryDescriptor(handle, &binary_desc, c_desc, a_desc, b_desc, BinaryMode::Add), STATUS_SUCCESS); -__C infiniopStatus_t infiniopAdd(infiniopAddDescriptor_t desc, void *c, void const *a, void const *b, void *stream) { - switch (desc->device) { -#ifdef ENABLE_CPU - case DevCpu: - return cpuAdd((AddCpuDescriptor_t) desc, c, a, b, stream); -#endif -#ifdef ENABLE_NV_GPU - case DevNvGpu: { - return cudaAdd((AddCudaDescriptor_t) desc, c, a, b, stream); - } + *(_AddDescriptor_t *) desc_ptr = new _AddDescriptor{ + handle->device, + binary_desc, + }; -#endif -#ifdef ENABLE_CAMBRICON_MLU - // TODO -#endif - } - return STATUS_BAD_DEVICE; + return STATUS_SUCCESS; } -__C infiniopStatus_t infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) { - switch (desc->device) { -#ifdef ENABLE_CPU - case DevCpu: - return cpuDestroyAddDescriptor((AddCpuDescriptor_t) desc); -#endif -#ifdef ENABLE_NV_GPU - case DevNvGpu: { - return cudaDestroyAddDescriptor((AddCudaDescriptor_t) desc); - } +__C __export infiniopStatus_t infiniopAdd(infiniopAddDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + auto _desc = (_AddDescriptor_t) desc; + CHECK_STATUS(infiniopBinary(_desc->binary_desc, c, a, b, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} -#endif -#ifdef ENABLE_CAMBRICON_MLU - // TODO -#endif - } - return STATUS_BAD_DEVICE; +__C __export infiniopStatus_t 
infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyBinaryDescriptor(((_AddDescriptor_t) desc)->binary_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; } diff --git a/src/ops/binary/binary.h b/src/ops/binary/binary.h new file mode 100644 index 00000000..1e953b51 --- /dev/null +++ b/src/ops/binary/binary.h @@ -0,0 +1,72 @@ +#ifndef BINARY_H +#define BINARY_H + +#include "export.h" +#include "operators.h" +#include + +typedef struct BinaryDescriptor { + Device device; +} BinaryDescriptor; + +typedef BinaryDescriptor *infiniopBinaryDescriptor_t; + +/** + * @brief Represents all the currently defined binary operations. + */ +struct BinaryMode { + /** + * Note: new operation type should be added **BEFORE** the "Count" type + * "Count" should remain as the last element. New Arithmetic operations should be + * added in the 'Arithmetic operations' section, i.e., before the 'Logical operations:' section. + */ + enum Mode { + // Arithmetic operations: + Add, + Subtract, + Multiply, + Divide, + Pow, + Mod, + Max, + Min, + BitwiseAnd, + BitwiseOr, + BitwiseXor, + + // Logical operations: + // **TODO Not currently supported** + // Requires Boolean data type + And, + Or, + Xor, + Less, + LessOrEqual, + Equal, + Greater, + GreaterOrEqual, + + Count,///< Number of binary operation types (marker for counting purposes). + }; + + // This static constant holds the total number of defined binary operations. + static const size_t numBinaryMode = Count; +}; + +__C infiniopStatus_t infiniopCreateBinaryDescriptor(infiniopHandle_t handle, + infiniopBinaryDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b, + int mode); + +__C infiniopStatus_t infiniopBinary(infiniopBinaryDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C infiniopStatus_t infiniopDestroyBinaryDescriptor(infiniopBinaryDescriptor_t desc); + + +#endif diff --git a/src/ops/binary/cpu/binary_cpu.cc b/src/ops/binary/cpu/binary_cpu.cc new file mode 100644 index 00000000..6ee50655 --- /dev/null +++ b/src/ops/binary/cpu/binary_cpu.cc @@ -0,0 +1,176 @@ +#include "binary_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" + +infiniopStatus_t cpuCreateBinaryDescriptor(infiniopHandle_t, + BinaryCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b, + int mode) { + uint64_t ndim = c->ndim; + if (!isValidBroadcastShape(a, b, c)) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!is_contiguous(a) || !is_contiguous(b) || !is_contiguous(c)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (c->dt != F16 && c->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (c->dt != a->dt || c->dt != b->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + bool broadcasted = false; + if (ndim != a->ndim || ndim != b->ndim) { + broadcasted = true; + } else { + for (uint64_t i = 0; i < ndim; ++i) { + if (c->shape[i] != a->shape[i] || c->shape[i] != b->shape[i]) { + broadcasted = true; + break; + } + } + } + if (mode < 0 || mode >= BinaryMode::numBinaryMode) { + return STATUS_BAD_PARAM; + } + // bitwise operations are only valid for integral types + if (c->dt.exponent != 0 && (mode == BinaryMode::BitwiseAnd || mode == BinaryMode::BitwiseOr || mode == BinaryMode::BitwiseXor)) { + return STATUS_BAD_PARAM; + } + + uint64_t c_data_size = std::accumulate(c->shape, c->shape + c->ndim, 1ULL, std::multiplies()); + + int64_t *a_strides = nullptr; + 
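/* The stride arrays stay null in the non-broadcast case so the compute loop
   can use a single flat index for a, b, and c; cpuDestroyBinaryDescriptor can
   still call delete[] on them unconditionally, since delete[] of a null
   pointer is a no-op. */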
int64_t *b_strides = nullptr; + int64_t *c_strides = nullptr; + uint64_t *c_shape = new uint64_t[ndim]; + std::copy(c->shape, c->shape + ndim, c_shape); + + if (broadcasted) { + // get the adjusted strides for a and b + a_strides = new int64_t[ndim]; + b_strides = new int64_t[ndim]; + c_strides = new int64_t[ndim]; + for (size_t i = 0; i < ndim; ++i) { + a_strides[i] = (i < ndim - a->ndim || c->shape[i] != a->shape[i + a->ndim - ndim]) ? 0 : a->strides[i + a->ndim - ndim]; + b_strides[i] = (i < ndim - b->ndim || c->shape[i] != b->shape[i + b->ndim - ndim]) ? 0 : b->strides[i + b->ndim - ndim]; + c_strides[i] = c->strides[i]; + } + } + + *desc_ptr = new BinaryCpuDescriptor{ + DevCpu, + c->dt, + ndim, + c_data_size, + c_shape, + a_strides, + b_strides, + c_strides, + broadcasted, + mode, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyBinaryDescriptor(BinaryCpuDescriptor_t desc) { + delete[] desc->a_strides; + delete[] desc->b_strides; + delete[] desc->c_strides; + delete[] desc->c_shape; + delete desc; + return STATUS_SUCCESS; +} + +// binary elementwise kernel that performs the actual binary computation +template +Tdata binary_kernel(const Tdata &a, const Tdata &b, int mode) { + switch (mode) { + // Arithmetic operations: + case BinaryMode::Add: + return a + b; + case BinaryMode::Subtract: + return a - b; + case BinaryMode::Multiply: + return a * b; + case BinaryMode::Divide: + return a / b; + case BinaryMode::Pow: + return std::pow(a, b); + case BinaryMode::Mod: + if constexpr (std::is_floating_point_v) { + return std::fmod(a, b); + } else { + return a % b; + } + case BinaryMode::Max: + if constexpr (std::is_floating_point_v) { + return std::fmax(a, b); + } else { + return std::max(a, b); + } + case BinaryMode::Min: + if constexpr (std::is_floating_point_v) { + return std::fmin(a, b); + } else { + return std::min(a, b); + } + case BinaryMode::BitwiseAnd: + if constexpr (std::is_integral_v) { + return a & b; + } else { + ASSERT(false); + } + case BinaryMode::BitwiseOr: + if constexpr (std::is_integral_v) { + return a | b; + } else { + ASSERT(false); + } + case BinaryMode::BitwiseXor: + if constexpr (std::is_integral_v) { + return a ^ b; + } else { + ASSERT(false); + } + // Logical operations: + default: + fprintf(stderr, "Unrecognized binary operation: how did you get here??\n"); + ASSERT(false); + return a; + } +} + +template +infiniopStatus_t binary_cpu(BinaryCpuDescriptor_t desc, void *c, void const *a, void const *b) { + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); + +#pragma omp parallel for + for (uint64_t i = 0; i < desc->c_data_size; ++i) { + auto a_index = desc->broadcasted ? getDstOffset(i, desc->ndim, desc->c_strides, desc->a_strides) : i; + auto b_index = desc->broadcasted ? 
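/* getDstOffset converts the flat output index i into an input offset by
   peeling place values with c's strides and re-accumulating them with the
   zero-padded input strides, so broadcast dimensions contribute nothing.
   A minimal sketch of that mapping, assuming the shared helper follows the
   usual div/mod formulation (parameter names here are illustrative):

       uint64_t getDstOffset(uint64_t flat, uint64_t ndim,
                             const int64_t *out_strides, const int64_t *in_strides) {
           uint64_t res = 0;
           for (uint64_t d = 0; d < ndim; ++d) {
               res += (flat / out_strides[d]) * in_strides[d]; // coordinate d, re-based
               flat %= out_strides[d];                         // remainder for inner dims
           }
           return res;
       }

   Example: c of shape (2, 3) has strides (3, 1); a b broadcast along dim 0
   has strides (0, 1), so flat index 4 maps to coords (1, 1) and b offset 1. */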
getDstOffset(i, desc->ndim, desc->c_strides, desc->b_strides) : i; + if constexpr (std::is_same::value) { + c_[i] = f32_to_f16(binary_kernel(f16_to_f32(a_[a_index]), f16_to_f32(b_[b_index]), desc->mode)); + } else { + c_[i] = binary_kernel(a_[a_index], b_[b_index], desc->mode); + } + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuBinary(BinaryCpuDescriptor_t desc, + void *c, void const *a, void const *b, + void *stream) { + if (desc->dtype == F16) { + return binary_cpu(desc, c, a, b); + } + if (desc->dtype == F32) { + return binary_cpu(desc, c, a, b); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/binary/cpu/binary_cpu.h b/src/ops/binary/cpu/binary_cpu.h new file mode 100644 index 00000000..b9502bd1 --- /dev/null +++ b/src/ops/binary/cpu/binary_cpu.h @@ -0,0 +1,37 @@ +#ifndef __CPU_BINARY_H__ +#define __CPU_BINARY_H__ + +#include "../binary.h" +#include "operators.h" +#include +#include + +struct BinaryCpuDescriptor { + Device device; + DT dtype; + uint64_t ndim; + uint64_t c_data_size; + uint64_t const *c_shape; + int64_t const *a_strides; + int64_t const *b_strides; + int64_t const *c_strides; + bool broadcasted; + int mode; +}; + +typedef struct BinaryCpuDescriptor *BinaryCpuDescriptor_t; + +infiniopStatus_t cpuCreateBinaryDescriptor(infiniopHandle_t, + BinaryCpuDescriptor_t *, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b, + int mode); + +infiniopStatus_t cpuBinary(BinaryCpuDescriptor_t desc, + void *c, void const *a, void const *b, + void *stream); + +infiniopStatus_t cpuDestroyBinaryDescriptor(BinaryCpuDescriptor_t desc); + +#endif diff --git a/src/ops/binary/cuda/binary.cc b/src/ops/binary/cuda/binary.cc new file mode 100644 index 00000000..9b91ea1f --- /dev/null +++ b/src/ops/binary/cuda/binary.cc @@ -0,0 +1,82 @@ +#include "binary.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateBinaryDescriptor(CudaHandle_t handle, + BinaryCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b, + int mode) { + uint64_t ndim = c->ndim; + if (!isValidBroadcastShape(a, b, c)) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!is_contiguous(a) || !is_contiguous(b) || !is_contiguous(c)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (c->dt != F16 && c->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (c->dt != a->dt || c->dt != b->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + bool broadcasted = false; + if (ndim != a->ndim || ndim != b->ndim) { + broadcasted = true; + } else { + for (uint64_t i = 0; i < ndim; ++i) { + if (c->shape[i] != a->shape[i] || c->shape[i] != b->shape[i]) { + broadcasted = true; + break; + } + } + } + if (mode < 0 || mode >= BinaryMode::numBinaryMode) { + return STATUS_BAD_PARAM; + } + // bitwise operations are only valid for integral types + if (c->dt.exponent != 0 && (mode == BinaryMode::BitwiseAnd || mode == BinaryMode::BitwiseOr || mode == BinaryMode::BitwiseXor)) { + return STATUS_BAD_PARAM; + } + + uint64_t c_data_size = std::accumulate(c->shape, c->shape + c->ndim, 1ULL, std::multiplies()); + + char *strides_d = nullptr; + size_t stride_arr_size = ndim * sizeof(int64_t); + if (broadcasted) { + // get the adjusted strides for a and b + int64_t a_strides[ndim]; + int64_t b_strides[ndim]; + for (size_t i = 0; i < ndim; ++i) { + a_strides[i] = (i < ndim - a->ndim || c->shape[i] != a->shape[i + a->ndim - ndim]) ? 
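/* Shapes are matched right-aligned, NumPy style: output dim i corresponds to
   input dim i + a->ndim - ndim. Dimensions the input lacks, or holds with
   size 1 while c is larger, get stride 0 so every output index along them
   reads the same input element. E.g., for c of shape (3, 2, 4, 5) and a of
   shape (4, 5), a keeps its two real strides and gets zeros for the two
   leading dims, which matches the broadcast cases in the Python tests. */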
0 : a->strides[i + a->ndim - ndim]; + b_strides[i] = (i < ndim - b->ndim || c->shape[i] != b->shape[i + b->ndim - ndim]) ? 0 : b->strides[i + b->ndim - ndim]; + } + + // malloc and copy the strides to the device + checkCudaErrorWithCode(cudaMalloc(&strides_d, 3 * stride_arr_size), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMemcpy(strides_d, a_strides, stride_arr_size, cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(strides_d + stride_arr_size, b_strides, stride_arr_size, cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(strides_d + 2 * stride_arr_size, c->strides, stride_arr_size, cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + } + + *desc_ptr = new BinaryCudaDescriptor{ + DevNvGpu, + c->dt, + handle->device_id, + ndim, + c_data_size, + static_cast(handle->prop.maxGridSize[0]), + reinterpret_cast(strides_d), + broadcasted, + mode, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyBinaryDescriptor(BinaryCudaDescriptor_t desc) { + checkCudaErrorWithCode(cudaFree((void *) desc->strides_d), STATUS_EXECUTION_FAILED); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/binary/cuda/binary.cu b/src/ops/binary/cuda/binary.cu new file mode 100644 index 00000000..a690ad14 --- /dev/null +++ b/src/ops/binary/cuda/binary.cu @@ -0,0 +1,285 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "binary.cuh" + +/** + * @brief A templated vector type that supports binary elementwise operation. + * + * @tparam T - The access data type for elements in the vector. + * @tparam TComp - The computation data type used for arithmetic operations. + * @tparam N - The number of elements of type T in the vector for a single access. + */ +template +struct vecN { + T data[N]; + constexpr static size_t pack_size = sizeof(T) / sizeof(TComp); + + __device__ __forceinline__ vecN compute(const vecN &other, BinaryMode::Mode mode) const { + vecN result; + for (int i = 0; i < N; ++i) { + if constexpr (std::is_same::value) { + result[i] = binary_op(data[i], other[i], mode); + } else { + auto data_ = reinterpret_cast *>(result.data); + auto this_data = reinterpret_cast *>(data); + auto other_data = reinterpret_cast *>(other.data); + data_[i] = binary_op(this_data[i], other_data[i], mode); + } + } + return result; + } + + // Overloaded operators that call the compute method with the appropriate mode. 
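// Each overload is a thin forward to compute() with the matching mode, so
// binary_op can treat a vecN exactly like a scalar: "a + b" on packed data
// recurses into per-lane binary_op calls, while modes without a C++
// operator (pow, mod, max, min) are exposed as named members that the
// scalar binary_op dispatches to.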
+ __device__ __forceinline__ vecN operator+(const vecN &other) const { + return compute(other, BinaryMode::Add); + } + + __device__ __forceinline__ vecN operator-(const vecN &other) const { + return compute(other, BinaryMode::Subtract); + } + + __device__ __forceinline__ vecN operator*(const vecN &other) const { + return compute(other, BinaryMode::Multiply); + } + + __device__ __forceinline__ vecN operator/(const vecN &other) const { + return compute(other, BinaryMode::Divide); + } + + __device__ __forceinline__ vecN operator&(const vecN &other) const { + return compute(other, BinaryMode::BitwiseAnd); + } + + __device__ __forceinline__ vecN operator|(const vecN &other) const { + return compute(other, BinaryMode::BitwiseOr); + } + + __device__ __forceinline__ vecN operator^(const vecN &other) const { + return compute(other, BinaryMode::BitwiseXor); + } + + __device__ __forceinline__ vecN pow(const vecN &other) const { + return compute(other, BinaryMode::Pow); + } + + __device__ __forceinline__ vecN mod(const vecN &other) const { + return compute(other, BinaryMode::Mod); + } + + __device__ __forceinline__ vecN max(const vecN &other) const { + return compute(other, BinaryMode::Max); + } + + __device__ __forceinline__ vecN min(const vecN &other) const { + return compute(other, BinaryMode::Min); + } + + __device__ __forceinline__ const T &operator[](size_t i) const { + return data[i]; + } + + __device__ __forceinline__ T &operator[](size_t i) { + return data[i]; + } +}; + + +template +__device__ __forceinline__ Tdata binary_op(const Tdata &a, const Tdata &b, int mode) { + switch (mode) { + // Arithmetic operations: + case BinaryMode::Add: + return a + b; + case BinaryMode::Subtract: + return a - b; + case BinaryMode::Multiply: + return a * b; + case BinaryMode::Divide: + return a / b; + case BinaryMode::Pow: + if constexpr (std::is_arithmetic_v) { + return std::pow(a, b); + } else if constexpr (std::is_same_v) { + return __float2half(std::pow(__half2float(a), __half2float(b))); + } else if constexpr (!std::is_same_v) { + return a.pow(b); + } + return a; + case BinaryMode::Mod: + if constexpr (std::is_floating_point_v) { + return std::fmod(a, b); + } else if constexpr (std::is_arithmetic_v) { + return a % b; + } else if constexpr (std::is_same_v) { + return __float2half(std::fmod(__half2float(a), __half2float(b))); + } else if constexpr (!std::is_same_v) { + return a.mod(b); + } + return a; + case BinaryMode::Max: + if constexpr (std::is_fundamental_v) { + if constexpr (std::is_floating_point_v) { + return std::fmax(a, b); + } else { + return std::max(a, b); + } + } else if constexpr (std::is_same_v) { + return __half2float(a) > __half2float(b) ? a : b; + } else if constexpr (!std::is_same_v) { + return a.max(b); + } + case BinaryMode::Min: + if constexpr (std::is_fundamental_v) { + if constexpr (std::is_floating_point_v) { + return std::fmin(a, b); + } else { + return std::min(a, b); + } + } else if constexpr (std::is_same_v) { + return __half2float(a) < __half2float(b) ? 
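/* Promoting to float for the comparison sidesteps native half comparisons,
   which only exist on newer GPU architectures, and returning the original
   operand avoids any rounding from a float round trip. */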
a : b; + } else if constexpr (!std::is_same_v) { + return a.min(b); + } + case BinaryMode::BitwiseAnd: + if constexpr (std::is_integral_v) { + return a & b; + } else { + printf("Error: Non-integral input to bitwise operatior\n"); + return a; + } + case BinaryMode::BitwiseOr: + if constexpr (std::is_integral_v) { + return a | b; + } else { + printf("Error: Non-integral input to bitwise operatior\n"); + return a; + } + case BinaryMode::BitwiseXor: + if constexpr (std::is_integral_v) { + return a ^ b; + } else { + printf("Error: Non-integral input to bitwise operatior\n"); + return a; + } + // Logical operations: + // TODO. Currently Not supported yet + default: + printf("Unrecognized binary operation: how did you get here??\n"); + return a; + } +} + +template +__global__ void binary( + Tdata *c, + const Tdata *a, + const Tdata *b, + const int64_t *__restrict__ a_strides, + const int64_t *__restrict__ b_strides, + const int64_t *__restrict__ c_strides, + uint64_t data_size, + uint64_t ndim, + uint64_t offset, + bool broadcasted, + unsigned pack_size, + const int mode) { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + + if (idx < data_size) { + if (broadcasted) { + idx *= pack_size; + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); +#pragma unroll + for (size_t i = 0; i < pack_size; ++i) { + auto a_idx = getDstOffset(idx + i, ndim, c_strides, a_strides); + auto b_idx = getDstOffset(idx + i, ndim, c_strides, b_strides); + c_[idx + i] = binary_op(a_[a_idx], b_[b_idx], mode); + } + return; + } + c[idx] = binary_op(a[idx], b[idx], mode); + } +} + +template +void _binary_nv_gpu(BinaryCudaDescriptor_t desc, Tdata *c, Tdata const *a, Tdata const *b, uint64_t data_size, uint64_t pack_size, uint64_t offset, void *stream) { + if (data_size == 0) { + return; + } + dim3 blockDims = dim3(std::min(static_cast(BLOCK_SIZE), data_size)); + dim3 gridDims = dim3(std::min(ROUND_UP_DIV(data_size, blockDims.x), desc->max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + + const int64_t *a_strides = nullptr; + const int64_t *b_strides = nullptr; + const int64_t *c_strides = nullptr; + if (desc->broadcasted) { + a_strides = desc->strides_d; + b_strides = desc->strides_d + desc->ndim; + c_strides = desc->strides_d + 2 * desc->ndim; + } + cudaStream_t cuda_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < data_size; i += step) { + binary<<>>( + c, a, b, a_strides, b_strides, c_strides, offset + data_size, desc->ndim, offset + i, desc->broadcasted, pack_size, desc->mode); + } +} + +template +infiniopStatus_t binary_nv_gpu(BinaryCudaDescriptor_t desc, void *c, void const *a, void const *b, void *stream, uint64_t pack_size) { + const auto data_size = desc->c_data_size / pack_size; + const auto a_vec = reinterpret_cast(a); + const auto b_vec = reinterpret_cast(b); + const auto c_vec = reinterpret_cast(c); + _binary_nv_gpu(desc, c_vec, a_vec, b_vec, data_size, pack_size, 0, stream); + + const auto remainder = desc->c_data_size % pack_size; + const auto a_ = reinterpret_cast(a); + const auto b_ = reinterpret_cast(b); + const auto c_ = reinterpret_cast(c); + _binary_nv_gpu(desc, c_, a_, b_, remainder, 1, data_size * pack_size, stream); + cudaDeviceSynchronize(); + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaBinary(BinaryCudaDescriptor_t desc, + void *c, void const *a, void const *b, + void *stream) { + checkCudaError(cudaSetDevice(desc->device_id)); + if (desc->dtype == F16) { + switch (desc->mode) { + case 
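/* The template arguments below pick a block size and vector width per mode:
   cheap arithmetic runs with 256 threads per block, heavier modes (pow, mod,
   max/min) with smaller blocks, presumably to limit register pressure from
   the transcendental and branchy paths; the trailing argument is how many
   scalars one vectorized access covers. */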
BinaryMode::Add: + case BinaryMode::Subtract: + case BinaryMode::Multiply: + case BinaryMode::Divide: + return binary_nv_gpu, half, 256>(desc, c, a, b, stream, 8); + case BinaryMode::Pow: + return binary_nv_gpu, half, 128>(desc, c, a, b, stream, 8); + default: + return binary_nv_gpu, half, 64>(desc, c, a, b, stream, 8); + } + } + if (desc->dtype == F32) { + switch (desc->mode) { + case BinaryMode::Add: + case BinaryMode::Subtract: + case BinaryMode::Multiply: + case BinaryMode::Divide: + return binary_nv_gpu, float, 256>(desc, c, a, b, stream, 4); + case BinaryMode::Pow: + return binary_nv_gpu, float, 256>(desc, c, a, b, stream, 8); + case BinaryMode::Mod: + return binary_nv_gpu, float, 128>(desc, c, a, b, stream, 4); + case BinaryMode::Max: + case BinaryMode::Min: + return binary_nv_gpu, float, 64>(desc, c, a, b, stream, 4); + default: + return binary_nv_gpu, float, 256>(desc, c, a, b, stream, 4); + } + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/binary/cuda/binary.cuh b/src/ops/binary/cuda/binary.cuh new file mode 100644 index 00000000..efcc8adc --- /dev/null +++ b/src/ops/binary/cuda/binary.cuh @@ -0,0 +1,38 @@ +#ifndef __CUDA_BINARY_H__ +#define __CUDA_BINARY_H__ + +#include "../../../devices/cuda/common_cuda.h" +#include "../../../devices/cuda/cuda_handle.h" +#include "../binary.h" +#include "operators.h" +#include +#include + +struct BinaryCudaDescriptor { + Device device; + DT dtype; + int device_id; + uint64_t ndim; + uint64_t c_data_size; + uint64_t max_grid_size; + int64_t const *strides_d; + bool broadcasted; + int mode; +}; + +typedef struct BinaryCudaDescriptor *BinaryCudaDescriptor_t; + +infiniopStatus_t cudaCreateBinaryDescriptor(CudaHandle_t, + BinaryCudaDescriptor_t *, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b, + int mode); + +infiniopStatus_t cudaBinary(BinaryCudaDescriptor_t desc, + void *c, void const *a, void const *b, + void *stream); + +infiniopStatus_t cudaDestroyBinaryDescriptor(BinaryCudaDescriptor_t desc); + +#endif diff --git a/src/ops/binary/operator.cc b/src/ops/binary/operator.cc new file mode 100644 index 00000000..5ad7b94f --- /dev/null +++ b/src/ops/binary/operator.cc @@ -0,0 +1,74 @@ +#include "../utils.h" +#include "binary.h" +#include "operators.h" + +#ifdef ENABLE_CPU +#include "cpu/binary_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/binary.cuh" +#endif + +__C infiniopStatus_t infiniopCreateBinaryDescriptor( + infiniopHandle_t handle, + infiniopBinaryDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b, + int mode) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateBinaryDescriptor(handle, (BinaryCpuDescriptor_t *) desc_ptr, c, a, b, mode); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateBinaryDescriptor((CudaHandle_t) handle, (BinaryCudaDescriptor_t *) desc_ptr, c, a, b, mode); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopBinary(infiniopBinaryDescriptor_t desc, void *c, void const *a, void const *b, void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuBinary((BinaryCpuDescriptor_t) desc, c, a, b, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaBinary((BinaryCudaDescriptor_t) desc, c, a, b, stream); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return 
STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyBinaryDescriptor(infiniopBinaryDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyBinaryDescriptor((BinaryCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaDestroyBinaryDescriptor((BinaryCudaDescriptor_t) desc); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/div/operator.cc b/src/ops/div/operator.cc new file mode 100644 index 00000000..794ffa05 --- /dev/null +++ b/src/ops/div/operator.cc @@ -0,0 +1,44 @@ +#include "../binary/binary.h" +#include "../utils.h" +#include "ops/div/div.h" +#include "tensor/tensor_descriptor.h" + +struct _DivDescriptor { + Device device; + infiniopBinaryDescriptor_t binary_desc; +}; + +typedef struct _DivDescriptor *_DivDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateDivDescriptor(infiniopHandle_t handle, + infiniopDivDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + // binary desc + infiniopBinaryDescriptor_t binary_desc; + CHECK_STATUS(infiniopCreateBinaryDescriptor(handle, &binary_desc, c_desc, a_desc, b_desc, BinaryMode::Divide), STATUS_SUCCESS); + + *(_DivDescriptor_t *) desc_ptr = new _DivDescriptor{ + handle->device, + binary_desc, + }; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDiv(infiniopDivDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + auto _desc = (_DivDescriptor_t) desc; + CHECK_STATUS(infiniopBinary(_desc->binary_desc, c, a, b, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyBinaryDescriptor(((_DivDescriptor_t) desc)->binary_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/max/operator.cc b/src/ops/max/operator.cc new file mode 100644 index 00000000..2e79eaa8 --- /dev/null +++ b/src/ops/max/operator.cc @@ -0,0 +1,44 @@ +#include "../binary/binary.h" +#include "../utils.h" +#include "ops/max/max.h" +#include "tensor/tensor_descriptor.h" + +struct _MaxDescriptor { + Device device; + infiniopBinaryDescriptor_t binary_desc; +}; + +typedef struct _MaxDescriptor *_MaxDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateMaxDescriptor(infiniopHandle_t handle, + infiniopMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + // binary desc + infiniopBinaryDescriptor_t binary_desc; + CHECK_STATUS(infiniopCreateBinaryDescriptor(handle, &binary_desc, c_desc, a_desc, b_desc, BinaryMode::Max), STATUS_SUCCESS); + + *(_MaxDescriptor_t *) desc_ptr = new _MaxDescriptor{ + handle->device, + binary_desc, + }; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopMax(infiniopMaxDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + auto _desc = (_MaxDescriptor_t) desc; + CHECK_STATUS(infiniopBinary(_desc->binary_desc, c, a, b, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyMaxDescriptor(infiniopMaxDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyBinaryDescriptor(((_MaxDescriptor_t) desc)->binary_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/min/operator.cc b/src/ops/min/operator.cc new file 
mode 100644 index 00000000..f6fd0924 --- /dev/null +++ b/src/ops/min/operator.cc @@ -0,0 +1,44 @@ +#include "../binary/binary.h" +#include "../utils.h" +#include "ops/min/min.h" +#include "tensor/tensor_descriptor.h" + +struct _MinDescriptor { + Device device; + infiniopBinaryDescriptor_t binary_desc; +}; + +typedef struct _MinDescriptor *_MinDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateMinDescriptor(infiniopHandle_t handle, + infiniopMinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + // binary desc + infiniopBinaryDescriptor_t binary_desc; + CHECK_STATUS(infiniopCreateBinaryDescriptor(handle, &binary_desc, c_desc, a_desc, b_desc, BinaryMode::Min), STATUS_SUCCESS); + + *(_MinDescriptor_t *) desc_ptr = new _MinDescriptor{ + handle->device, + binary_desc, + }; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopMin(infiniopMinDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + auto _desc = (_MinDescriptor_t) desc; + CHECK_STATUS(infiniopBinary(_desc->binary_desc, c, a, b, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyMinDescriptor(infiniopMinDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyBinaryDescriptor(((_MinDescriptor_t) desc)->binary_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/mod/operator.cc b/src/ops/mod/operator.cc new file mode 100644 index 00000000..3b92d729 --- /dev/null +++ b/src/ops/mod/operator.cc @@ -0,0 +1,44 @@ +#include "../binary/binary.h" +#include "../utils.h" +#include "ops/mod/mod.h" +#include "tensor/tensor_descriptor.h" + +struct _ModDescriptor { + Device device; + infiniopBinaryDescriptor_t binary_desc; +}; + +typedef struct _ModDescriptor *_ModDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateModDescriptor(infiniopHandle_t handle, + infiniopModDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + // binary desc + infiniopBinaryDescriptor_t binary_desc; + CHECK_STATUS(infiniopCreateBinaryDescriptor(handle, &binary_desc, c_desc, a_desc, b_desc, BinaryMode::Mod), STATUS_SUCCESS); + + *(_ModDescriptor_t *) desc_ptr = new _ModDescriptor{ + handle->device, + binary_desc, + }; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopMod(infiniopModDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + auto _desc = (_ModDescriptor_t) desc; + CHECK_STATUS(infiniopBinary(_desc->binary_desc, c, a, b, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyModDescriptor(infiniopModDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyBinaryDescriptor(((_ModDescriptor_t) desc)->binary_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/mul/operator.cc b/src/ops/mul/operator.cc new file mode 100644 index 00000000..6c115c37 --- /dev/null +++ b/src/ops/mul/operator.cc @@ -0,0 +1,44 @@ +#include "../binary/binary.h" +#include "../utils.h" +#include "ops/mul/mul.h" +#include "tensor/tensor_descriptor.h" + +struct _MulDescriptor { + Device device; + infiniopBinaryDescriptor_t binary_desc; +}; + +typedef struct _MulDescriptor *_MulDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateMulDescriptor(infiniopHandle_t handle, + infiniopMulDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + 
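/* Mod maps to BinaryMode::Mod, which both backends implement with fmod
   semantics for floating types: the result takes the sign of the dividend,
   matching torch.fmod (the reference used by the Python test) rather than
   Python's % operator. */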
infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + // binary desc + infiniopBinaryDescriptor_t binary_desc; + CHECK_STATUS(infiniopCreateBinaryDescriptor(handle, &binary_desc, c_desc, a_desc, b_desc, BinaryMode::Multiply), STATUS_SUCCESS); + + *(_MulDescriptor_t *) desc_ptr = new _MulDescriptor{ + handle->device, + binary_desc, + }; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopMul(infiniopMulDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + auto _desc = (_MulDescriptor_t) desc; + CHECK_STATUS(infiniopBinary(_desc->binary_desc, c, a, b, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyBinaryDescriptor(((_MulDescriptor_t) desc)->binary_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/pow/operator.cc b/src/ops/pow/operator.cc new file mode 100644 index 00000000..4a47f3e3 --- /dev/null +++ b/src/ops/pow/operator.cc @@ -0,0 +1,44 @@ +#include "../binary/binary.h" +#include "../utils.h" +#include "ops/pow/pow.h" +#include "tensor/tensor_descriptor.h" + +struct _PowDescriptor { + Device device; + infiniopBinaryDescriptor_t binary_desc; +}; + +typedef struct _PowDescriptor *_PowDescriptor_t; + +__C __export infiniopStatus_t infiniopCreatePowDescriptor(infiniopHandle_t handle, + infiniopPowDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + // binary desc + infiniopBinaryDescriptor_t binary_desc; + CHECK_STATUS(infiniopCreateBinaryDescriptor(handle, &binary_desc, c_desc, a_desc, b_desc, BinaryMode::Pow), STATUS_SUCCESS); + + *(_PowDescriptor_t *) desc_ptr = new _PowDescriptor{ + handle->device, + binary_desc, + }; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopPow(infiniopPowDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + auto _desc = (_PowDescriptor_t) desc; + CHECK_STATUS(infiniopBinary(_desc->binary_desc, c, a, b, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyPowDescriptor(infiniopPowDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyBinaryDescriptor(((_PowDescriptor_t) desc)->binary_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/sub/operator.cc b/src/ops/sub/operator.cc new file mode 100644 index 00000000..0943548d --- /dev/null +++ b/src/ops/sub/operator.cc @@ -0,0 +1,44 @@ +#include "../binary/binary.h" +#include "../utils.h" +#include "ops/sub/sub.h" +#include "tensor/tensor_descriptor.h" + +struct _SubDescriptor { + Device device; + infiniopBinaryDescriptor_t binary_desc; +}; + +typedef struct _SubDescriptor *_SubDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateSubDescriptor(infiniopHandle_t handle, + infiniopSubDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + // binary desc + infiniopBinaryDescriptor_t binary_desc; + CHECK_STATUS(infiniopCreateBinaryDescriptor(handle, &binary_desc, c_desc, a_desc, b_desc, BinaryMode::Subtract), STATUS_SUCCESS); + + *(_SubDescriptor_t *) desc_ptr = new _SubDescriptor{ + handle->device, + binary_desc, + }; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopSub(infiniopSubDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { 
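// Every concrete elementwise operator is now this same thin wrapper: the
// descriptor stores only {device, binary_desc}, creation forwards the tensor
// descriptors plus a BinaryMode, and compute/destroy delegate to the shared
// binary implementation. A minimal caller sketch using the signatures
// declared in this patch (error handling elided):
//
//     infiniopSubDescriptor_t desc;
//     infiniopCreateSubDescriptor(handle, &desc, c_desc, a_desc, b_desc);
//     infiniopSub(desc, c_data, a_data, b_data, stream);
//     infiniopDestroySubDescriptor(desc);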
+ auto _desc = (_SubDescriptor_t) desc; + CHECK_STATUS(infiniopBinary(_desc->binary_desc, c, a, b, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroySubDescriptor(infiniopSubDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyBinaryDescriptor(((_SubDescriptor_t) desc)->binary_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} From 79715a1f0b175f67c98d489ac22e63fc00070c05 Mon Sep 17 00:00:00 2001 From: Zimin Li Date: Mon, 25 Nov 2024 15:31:43 +0800 Subject: [PATCH 2/5] Add corresponding include statements to infini_operators.h and optimized vecN and binary kernel --- include/infini_operators.h | 12 ++++++++++-- src/ops/binary/cuda/binary.cu | 34 +++++++++++++++++++++++----------- 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/include/infini_operators.h b/include/infini_operators.h index 906a3771..8e8fdf4d 100644 --- a/include/infini_operators.h +++ b/include/infini_operators.h @@ -2,16 +2,24 @@ #include "ops/add/add.h" #include "ops/attention/attention.h" #include "ops/causal_softmax/causal_softmax.h" -#include "ops/global_avg_pool/global_avg_pool.h" +#include "ops/conv/conv.h" +#include "ops/div/div.h" #include "ops/expand/expand.h" #include "ops/gemm/gemm.h" -#include "ops/conv/conv.h" +#include "ops/global_avg_pool/global_avg_pool.h" #include "ops/matmul/matmul.h" +#include "ops/max/max.h" +#include "ops/min/min.h" #include "ops/mlp/mlp.h" +#include "ops/mod/mod.h" +#include "ops/mul/mul.h" +#include "ops/pow/pow.h" #include "ops/random_sample/random_sample.h" #include "ops/rearrange/rearrange.h" #include "ops/relu/relu.h" #include "ops/rms_norm/rms_norm.h" #include "ops/rotary_embedding/rotary_embedding.h" +#include "ops/sub/sub.h" #include "ops/swiglu/swiglu.h" +#include "ops/transpose/transpose.h" #include "tensor/tensor_descriptor.h" diff --git a/src/ops/binary/cuda/binary.cu b/src/ops/binary/cuda/binary.cu index a690ad14..55ddadee 100644 --- a/src/ops/binary/cuda/binary.cu +++ b/src/ops/binary/cuda/binary.cu @@ -1,6 +1,7 @@ #include "../../../devices/cuda/common_cuda.h" #include "../../utils.h" #include "binary.cuh" +#include /** * @brief A templated vector type that supports binary elementwise operation. 
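 *
 * The parameters relate as follows: one load moves N values of type T, and
 * arithmetic runs on TComp; when T is wider than TComp (e.g. half2 holding
 * two halves), compute() reinterprets each element as a nested vecN of
 * TComp and dispatches binary_op on it. As a purely hypothetical
 * instantiation for the F16 path, vecN<half2, half, 4> would cover 8 halves
 * per access, matching the pack size of 8 passed for F16.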
@@ -16,14 +17,15 @@ struct vecN { __device__ __forceinline__ vecN compute(const vecN &other, BinaryMode::Mode mode) const { vecN result; +#pragma unroll for (int i = 0; i < N; ++i) { - if constexpr (std::is_same::value) { + if constexpr (std::is_same_v) { result[i] = binary_op(data[i], other[i], mode); } else { - auto data_ = reinterpret_cast *>(result.data); - auto this_data = reinterpret_cast *>(data); - auto other_data = reinterpret_cast *>(other.data); - data_[i] = binary_op(this_data[i], other_data[i], mode); + auto &data_ = reinterpret_cast *>(result.data)[i]; + const auto this_data = reinterpret_cast *>(data)[i]; + const auto other_data = reinterpret_cast *>(other.data)[i]; + data_ = binary_op(this_data, other_data, mode); } } return result; @@ -89,7 +91,13 @@ __device__ __forceinline__ Tdata binary_op(const Tdata &a, const Tdata &b, int m switch (mode) { // Arithmetic operations: case BinaryMode::Add: - return a + b; + if constexpr (std::is_same_v) { + return __hadd2_rn(a, b); + } else if constexpr (std::is_same_v) { + return __fadd_rn(a, b); + } else { + return a + b; + } case BinaryMode::Subtract: return a - b; case BinaryMode::Multiply: @@ -199,7 +207,9 @@ __global__ void binary( } return; } - c[idx] = binary_op(a[idx], b[idx], mode); + Tdata a_data = a[idx]; + Tdata b_data = b[idx]; + c[idx] = binary_op(a_data, b_data, mode); } } @@ -238,10 +248,12 @@ infiniopStatus_t binary_nv_gpu(BinaryCudaDescriptor_t desc, void *c, void const _binary_nv_gpu(desc, c_vec, a_vec, b_vec, data_size, pack_size, 0, stream); const auto remainder = desc->c_data_size % pack_size; - const auto a_ = reinterpret_cast(a); - const auto b_ = reinterpret_cast(b); - const auto c_ = reinterpret_cast(c); - _binary_nv_gpu(desc, c_, a_, b_, remainder, 1, data_size * pack_size, stream); + if (remainder > 0) { + const auto a_ = reinterpret_cast(a); + const auto b_ = reinterpret_cast(b); + const auto c_ = reinterpret_cast(c); + _binary_nv_gpu(desc, c_, a_, b_, remainder, 1, data_size * pack_size, stream); + } cudaDeviceSynchronize(); return STATUS_SUCCESS; } From 1dc21fc40f065f7b5bc558480b8838560d660b5f Mon Sep 17 00:00:00 2001 From: Zimin Li Date: Tue, 26 Nov 2024 10:28:52 +0800 Subject: [PATCH 3/5] Add specific functions for add, sub, and mul for half2 and float --- src/ops/binary/cuda/binary.cu | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/ops/binary/cuda/binary.cu b/src/ops/binary/cuda/binary.cu index 55ddadee..5be4b714 100644 --- a/src/ops/binary/cuda/binary.cu +++ b/src/ops/binary/cuda/binary.cu @@ -99,9 +99,21 @@ __device__ __forceinline__ Tdata binary_op(const Tdata &a, const Tdata &b, int m return a + b; } case BinaryMode::Subtract: - return a - b; + if constexpr (std::is_same_v) { + return __hsub2(a, b); + } else if constexpr (std::is_same_v) { + return __fsub_rn(a, b); + } else { + return a - b; + } case BinaryMode::Multiply: - return a * b; + if constexpr (std::is_same_v) { + return __hmul2(a, b); + } else if constexpr (std::is_same_v) { + return __fmul_rd(a, b); + } else { + return a * b; + } case BinaryMode::Divide: return a / b; case BinaryMode::Pow: From b1d4ac3409ad74a46ac678ec2cdd8d0455e74993 Mon Sep 17 00:00:00 2001 From: Zimin Li Date: Thu, 28 Nov 2024 14:52:04 +0800 Subject: [PATCH 4/5] Add check_error to the tests, optimize various operators --- operatorspy/tests/add.py | 12 ++- operatorspy/tests/div.py | 12 ++- operatorspy/tests/max.py | 12 ++- operatorspy/tests/min.py | 76 ++------------- operatorspy/tests/mod.py | 171 
+++++++++++++++++++++++++--------- operatorspy/tests/mul.py | 12 ++- operatorspy/tests/pow.py | 12 ++- operatorspy/tests/sub.py | 12 ++- src/ops/binary/cuda/binary.cu | 50 +++++++--- 9 files changed, 219 insertions(+), 150 deletions(-) diff --git a/operatorspy/tests/add.py b/operatorspy/tests/add.py index b4d2203e..0217d7d2 100644 --- a/operatorspy/tests/add.py +++ b/operatorspy/tests/add.py @@ -95,14 +95,18 @@ def test( ) for i in range(NUM_PRERUN if PROFILE else 1): - lib.infiniopAdd( - descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + check_error( + lib.infiniopAdd( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) ) if PROFILE: start_time = time.time() for i in range(NUM_ITERATIONS): - lib.infiniopAdd( - descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + check_error( + lib.infiniopAdd( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) ) elapsed = (time.time() - start_time) / NUM_ITERATIONS print(f" lib time: {elapsed :6f}") diff --git a/operatorspy/tests/div.py b/operatorspy/tests/div.py index c303c635..37ebbc84 100644 --- a/operatorspy/tests/div.py +++ b/operatorspy/tests/div.py @@ -114,14 +114,18 @@ def test( ) for i in range(NUM_PRERUN if PROFILE else 1): - lib.infiniopDiv( - descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + check_error( + lib.infiniopDiv( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) ) if PROFILE: start_time = time.time() for i in range(NUM_ITERATIONS): - lib.infiniopDiv( - descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + check_error( + lib.infiniopDiv( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) ) elapsed = (time.time() - start_time) / NUM_ITERATIONS print(f" lib time: {elapsed :6f}") diff --git a/operatorspy/tests/max.py b/operatorspy/tests/max.py index 39eaf2b3..2efeb651 100644 --- a/operatorspy/tests/max.py +++ b/operatorspy/tests/max.py @@ -95,14 +95,18 @@ def test( ) for i in range(NUM_PRERUN if PROFILE else 1): - lib.infiniopMax( - descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + check_error( + lib.infiniopMax( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) ) if PROFILE: start_time = time.time() for i in range(NUM_ITERATIONS): - lib.infiniopMax( - descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + check_error( + lib.infiniopMax( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) ) elapsed = (time.time() - start_time) / NUM_ITERATIONS print(f" lib time: {elapsed :6f}") diff --git a/operatorspy/tests/min.py b/operatorspy/tests/min.py index 050ea64a..68923344 100644 --- a/operatorspy/tests/min.py +++ b/operatorspy/tests/min.py @@ -95,14 +95,18 @@ def test( ) for i in range(NUM_PRERUN if PROFILE else 1): - lib.infiniopMin( - descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + check_error( + lib.infiniopMin( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) ) if PROFILE: start_time = time.time() for i in range(NUM_ITERATIONS): - lib.infiniopMin( - descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + check_error( + lib.infiniopMin( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) ) elapsed = (time.time() - start_time) / NUM_ITERATIONS print(f" lib time: {elapsed :6f}") @@ -124,66 +128,4 @@ def test_cuda(lib, test_cases): device = DeviceEnum.DEVICE_CUDA handle = create_handle(lib, device) for c_shape, a_shape, b_shape, inplace in test_cases: - test(lib, handle, "cuda", c_shape, a_shape, b_shape, 
-        test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
-    destroy_handle(lib, handle)
-
-
-def test_bang(lib, test_cases):
-    import torch_mlu
-
-    device = DeviceEnum.DEVICE_BANG
-    handle = create_handle(lib, device)
-    for c_shape, a_shape, b_shape, inplace in test_cases:
-        test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
-        test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
-    destroy_handle(lib, handle)
-
-
-if __name__ == "__main__":
-    test_cases = [
-        # c_shape, a_shape, b_shape, inplace
-        # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE),
-        # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE),
-        ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE),
-        ((), (), (), Inplace.OUT_OF_PLACE),
-        ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE),
-        ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE),
-        ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE),
-        ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE),
-        ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE),
-        ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
-        ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
-    ]
-    args = get_args()
-    lib = open_lib()
-    lib.infiniopCreateMinDescriptor.restype = c_int32
-    lib.infiniopCreateMinDescriptor.argtypes = [
-        infiniopHandle_t,
-        POINTER(infiniopMinDescriptor_t),
-        infiniopTensorDescriptor_t,
-        infiniopTensorDescriptor_t,
-        infiniopTensorDescriptor_t,
-    ]
-    lib.infiniopMin.restype = c_int32
-    lib.infiniopMin.argtypes = [
-        infiniopMinDescriptor_t,
-        c_void_p,
-        c_void_p,
-        c_void_p,
-        c_void_p,
-    ]
-    lib.infiniopDestroyMinDescriptor.restype = c_int32
-    lib.infiniopDestroyMinDescriptor.argtypes = [
-        infiniopMinDescriptor_t,
-    ]
-
-    if args.cpu:
-        test_cpu(lib, test_cases)
-    if args.cuda:
-        test_cuda(lib, test_cases)
-    if args.bang:
-        test_bang(lib, test_cases)
-    if not (args.cpu or args.cuda or args.bang):
-        test_cpu(lib, test_cases)
-    print("\033[92mTest passed!\033[0m")
+    test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=
\ No newline at end of file
diff --git a/operatorspy/tests/mod.py b/operatorspy/tests/mod.py
index 342e3662..9092f90b 100644
--- a/operatorspy/tests/mod.py
+++ b/operatorspy/tests/mod.py
@@ -27,6 +27,7 @@
 NUM_PRERUN = 10
 NUM_ITERATIONS = 1000
 
+
 class Inplace(Enum):
     OUT_OF_PLACE = auto()
     INPLACE_A = auto()
@@ -40,9 +41,8 @@ class ModDescriptor(Structure):
 
 infiniopModDescriptor_t = POINTER(ModDescriptor)
 
-def find_and_print_differing_indices(
-    a, b, c, ans, atol=0, rtol=1e-2
-):
+
+def find_and_print_differing_indices(a, b, c, ans, atol=0, rtol=1e-2):
     if c.shape != ans.shape:
         raise ValueError("Tensors must have the same shape to compare.")
 
@@ -63,7 +63,7 @@ def find_and_print_differing_indices(
 def mod(x, y):
     if PROFILE:
         ans = torch.fmod(x, y)
-        torch.cuda.synchronize()
+        torch.cuda.synchronize()
         return ans
     return torch.fmod(x, y)
@@ -72,8 +72,8 @@ def test(
     lib,
     handle,
     torch_device,
-    c_shape, 
-    a_shape, 
+    c_shape,
+    a_shape,
     b_shape,
     tensor_dtype=torch.float16,
     inplace=Inplace.OUT_OF_PLACE,
 ):
@@ -86,9 +86,17 @@ def test(
         return
 
     a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) * 10
-    b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) if inplace != Inplace.INPLACE_AB else a
-    c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b)
-    
+    b = (
+        torch.rand(b_shape, dtype=tensor_dtype).to(torch_device)
+        if inplace != Inplace.INPLACE_AB
+        else a
+    )
+    c = (
+        torch.rand(c_shape, dtype=tensor_dtype).to(torch_device)
+        if inplace == Inplace.OUT_OF_PLACE
+        else (a if inplace == Inplace.INPLACE_A else b)
+    )
+
     for i in range(NUM_PRERUN if PROFILE else 1):
         ans = mod(a, b)
     if PROFILE:
@@ -100,7 +108,11 @@ def test(
 
     a_tensor = to_tensor(a, lib)
     b_tensor = to_tensor(b, lib) if inplace != Inplace.INPLACE_AB else a_tensor
-    c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
+    c_tensor = (
+        to_tensor(c, lib)
+        if inplace == Inplace.OUT_OF_PLACE
+        else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
+    )
 
     descriptor = infiniopModDescriptor_t()
     check_error(
@@ -112,16 +124,20 @@ def test(
             b_tensor.descriptor,
         )
     )
-    
+
     for i in range(NUM_PRERUN if PROFILE else 1):
-        lib.infiniopMod(
-            descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+        check_error(
+            lib.infiniopMod(
+                descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+            )
         )
     if PROFILE:
         start_time = time.time()
         for i in range(NUM_ITERATIONS):
-            lib.infiniopMod(
-                descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+            check_error(
+                lib.infiniopMod(
+                    descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+                )
             )
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f"    lib time: {elapsed :6f}")
@@ -134,8 +150,26 @@ def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
     for c_shape, a_shape, b_shape, inplace in test_cases:
-        test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
-        test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
+        test(
+            lib,
+            handle,
+            "cpu",
+            c_shape,
+            a_shape,
+            b_shape,
+            tensor_dtype=torch.float16,
+            inplace=inplace,
+        )
+        test(
+            lib,
+            handle,
+            "cpu",
+            c_shape,
+            a_shape,
+            b_shape,
+            tensor_dtype=torch.float32,
+            inplace=inplace,
+        )
     destroy_handle(lib, handle)
 
 
@@ -143,8 +177,26 @@ def test_cuda(lib, test_cases):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
     for c_shape, a_shape, b_shape, inplace in test_cases:
-        test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
-        test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
+        test(
+            lib,
+            handle,
+            "cuda",
+            c_shape,
+            a_shape,
+            b_shape,
+            tensor_dtype=torch.float16,
+            inplace=inplace,
+        )
+        test(
+            lib,
+            handle,
+            "cuda",
+            c_shape,
+            a_shape,
+            b_shape,
+            tensor_dtype=torch.float32,
+            inplace=inplace,
+        )
     destroy_handle(lib, handle)
@@ -154,8 +206,26 @@ def test_bang(lib, test_cases):
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
     for c_shape, a_shape, b_shape, inplace in test_cases:
-        test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
-        test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
+        test(
+            lib,
+            handle,
+            "mlu",
+            c_shape,
+            a_shape,
+            b_shape,
+            tensor_dtype=torch.float16,
+            inplace=inplace,
+        )
+        test(
+            lib,
+            handle,
+            "mlu",
+            c_shape,
+            a_shape,
+            b_shape,
+            tensor_dtype=torch.float32,
+            inplace=inplace,
+        )
     destroy_handle(lib, handle)
 
 
@@ -169,10 +239,41 @@ def test_bang(lib, test_cases):
         ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE),
         ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE),
         ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.OUT_OF_PLACE),
-        ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE),
-        ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE),
-        ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
-        ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
+        (
+            ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A)
+            if not PROFILE
+            else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE)
+        ),
+        (
+            ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B)
+            if not PROFILE
+            else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE)
+        ),
+        (
+            ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB)
+            if not PROFILE
+            else (
+                (32, 256, 112, 112),
+                (32, 256, 112, 1),
+                (32, 256, 112, 112),
+                Inplace.OUT_OF_PLACE,
+            )
+        ),
+        (
+            (
+                (32, 3, 112, 112),
+                (32, 3, 112, 112),
+                (32, 3, 112, 112),
+                Inplace.OUT_OF_PLACE,
+            )
+            if not PROFILE
+            else (
+                (32, 256, 112, 112),
+                (32, 256, 112, 112),
+                (32, 256, 112, 112),
+                Inplace.OUT_OF_PLACE,
+            )
+        ),
     ]
     args = get_args()
     lib = open_lib()
@@ -187,22 +288,4 @@ def test_bang(lib, test_cases):
     lib.infiniopMod.restype = c_int32
     lib.infiniopMod.argtypes = [
         infiniopModDescriptor_t,
-        c_void_p,
-        c_void_p,
-        c_void_p,
-        c_void_p,
-    ]
-    lib.infiniopDestroyModDescriptor.restype = c_int32
-    lib.infiniopDestroyModDescriptor.argtypes = [
-        infiniopModDescriptor_t,
-    ]
-
-    if args.cpu:
-        test_cpu(lib, test_cases)
-    if args.cuda:
-        test_cuda(lib, test_cases)
-    if args.bang:
-        test_bang(lib, test_cases)
-    if not (args.cpu or args.cuda or args.bang):
-        test_cpu(lib, test_cases)
-    print("\033[92mTest passed!\033[0m")
+    
\ No newline at end of file
diff --git a/operatorspy/tests/mul.py b/operatorspy/tests/mul.py
index 57085b42..11baabb9 100644
--- a/operatorspy/tests/mul.py
+++ b/operatorspy/tests/mul.py
@@ -95,14 +95,18 @@ def test(
     )
 
     for i in range(NUM_PRERUN if PROFILE else 1):
-        lib.infiniopMul(
-            descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+        check_error(
+            lib.infiniopMul(
+                descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+            )
         )
     if PROFILE:
         start_time = time.time()
         for i in range(NUM_ITERATIONS):
-            lib.infiniopMul(
-                descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+            check_error(
+                lib.infiniopMul(
+                    descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+                )
             )
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f"    lib time: {elapsed :6f}")
diff --git a/operatorspy/tests/pow.py b/operatorspy/tests/pow.py
index e5106b36..7e1a72ee 100644
--- a/operatorspy/tests/pow.py
+++ b/operatorspy/tests/pow.py
@@ -92,14 +92,18 @@ def test(
     )
 
     for i in range(NUM_PRERUN if PROFILE else 1):
-        lib.infiniopPow(
-            descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+        check_error(
+            lib.infiniopPow(
+                descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+            )
         )
     if PROFILE:
         start_time = time.time()
         for i in range(NUM_ITERATIONS):
-            lib.infiniopPow(
-                descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+            check_error(
+                lib.infiniopPow(
+                    descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+                )
             )
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f"    lib time: {elapsed :6f}")
diff --git a/operatorspy/tests/sub.py b/operatorspy/tests/sub.py
index cc8eb406..ef68b1a4 100644
--- a/operatorspy/tests/sub.py
+++ b/operatorspy/tests/sub.py
@@ -95,14 +95,18 @@ def test(
     )
 
     for i in range(NUM_PRERUN if PROFILE else 1):
-        lib.infiniopSub(
-            descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+        check_error(
+            lib.infiniopSub(
+                descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+            )
         )
     if PROFILE:
         start_time = time.time()
         for i in range(NUM_ITERATIONS):
-            lib.infiniopSub(
-                descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+            check_error(
+                lib.infiniopSub(
+                    descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
+                )
             )
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f"    lib time: {elapsed :6f}")
diff --git a/src/ops/binary/cuda/binary.cu b/src/ops/binary/cuda/binary.cu
index 5be4b714..68ea712a 100644
--- a/src/ops/binary/cuda/binary.cu
+++ b/src/ops/binary/cuda/binary.cu
@@ -115,12 +115,26 @@ __device__ __forceinline__ Tdata binary_op(const Tdata &a, const Tdata &b, int m
             return a * b;
         }
         case BinaryMode::Divide:
-            return a / b;
+            if constexpr (std::is_same_v<Tdata, half2>) {
+                return __h2div(a, b);
+            } else if constexpr (std::is_same_v<Tdata, float>) {
+                return __fdividef(a, b);
+            } else {
+                return a / b;
+            }
+            // return a / b;
         case BinaryMode::Pow:
             if constexpr (std::is_arithmetic_v<Tdata>) {
-                return std::pow(a, b);
+                if constexpr (std::is_same_v<Tdata, float>) {
+                    return __powf(a, b);
+                } else {
+                    return std::pow(a, b);
+                }
             } else if constexpr (std::is_same_v<Tdata, half>) {
-                return __float2half(std::pow(__half2float(a), __half2float(b)));
+                float a_ = __half2float(a);
+                float b_ = __half2float(b);
+                float ans_f = __powf(a_, b_);
+                return __float2half(isnan(ans_f) ? std::pow(a_, b_) : ans_f);
             } else if constexpr (!std::is_same_v<Tdata, half2>) {
                 return a.pow(b);
             }
@@ -128,10 +142,10 @@ __device__ __forceinline__ Tdata binary_op(const Tdata &a, const Tdata &b, int m
         case BinaryMode::Mod:
             if constexpr (std::is_floating_point_v<Tdata>) {
                 return std::fmod(a, b);
-            } else if constexpr (std::is_arithmetic_v<Tdata>) {
-                return a % b;
             } else if constexpr (std::is_same_v<Tdata, half>) {
                 return __float2half(std::fmod(__half2float(a), __half2float(b)));
+            } else if constexpr (std::is_arithmetic_v<Tdata>) {
+                return a % b;
             } else if constexpr (!std::is_same_v<Tdata, half2>) {
                 return a.mod(b);
             }
@@ -144,8 +158,10 @@ __device__ __forceinline__ Tdata binary_op(const Tdata &a, const Tdata &b, int m
                     return std::max(a, b);
                 }
             } else if constexpr (std::is_same_v<Tdata, half>) {
-                return __half2float(a) > __half2float(b) ? a : b;
-            } else if constexpr (!std::is_same_v<Tdata, half2>) {
+                return __hmax(a, b);
+            } else if constexpr (std::is_same_v<Tdata, half2>) {
+                return __hmax2(a, b);
+            } else {
                 return a.max(b);
             }
         case BinaryMode::Min:
            if constexpr (std::is_arithmetic_v<Tdata>) {
@@ -156,35 +172,37 @@ __device__ __forceinline__ Tdata binary_op(const Tdata &a, const Tdata &b, int m
                     return std::min(a, b);
                 }
             } else if constexpr (std::is_same_v<Tdata, half>) {
-                return __half2float(a) < __half2float(b) ? a : b;
-            } else if constexpr (!std::is_same_v<Tdata, half2>) {
+                return __hmin(a, b);
+            } else if constexpr (std::is_same_v<Tdata, half2>) {
+                return __hmin2(a, b);
+            } else {
                 return a.min(b);
             }
         case BinaryMode::BitwiseAnd:
             if constexpr (std::is_integral_v<Tdata>) {
                 return a & b;
             } else {
-                printf("Error: Non-integral input to bitwise operatior\n");
+                printf("Error: Non-integral input to bitwise operator\n");
                 return a;
             }
         case BinaryMode::BitwiseOr:
             if constexpr (std::is_integral_v<Tdata>) {
                 return a | b;
             } else {
-                printf("Error: Non-integral input to bitwise operatior\n");
+                printf("Error: Non-integral input to bitwise operator\n");
                 return a;
             }
         case BinaryMode::BitwiseXor:
             if constexpr (std::is_integral_v<Tdata>) {
                 return a ^ b;
             } else {
-                printf("Error: Non-integral input to bitwise operatior\n");
+                printf("Error: Non-integral input to bitwise operator\n");
                 return a;
             }
         // Logical operations:
         // TODO. Currently Not supported yet
         default:
-            printf("Unrecognized binary operation: how did you get here??\n");
+            printf("Unrecognized binary operation\n");
             return a;
     }
 }
@@ -282,7 +300,10 @@ infiniopStatus_t cudaBinary(BinaryCudaDescriptor_t desc,
             case BinaryMode::Divide:
                 return binary_nv_gpu, half, 256>(desc, c, a, b, stream, 8);
             case BinaryMode::Pow:
-                return binary_nv_gpu, half, 128>(desc, c, a, b, stream, 8);
+                return binary_nv_gpu, half, 128>(desc, c, a, b, stream, 16);
+            case BinaryMode::Max:
+            case BinaryMode::Min:
+                return binary_nv_gpu, half, 256>(desc, c, a, b, stream, 8);
             default:
                 return binary_nv_gpu, half, 64>(desc, c, a, b, stream, 8);
         }
@@ -295,7 +316,6 @@ infiniopStatus_t cudaBinary(BinaryCudaDescriptor_t desc,
             case BinaryMode::Divide:
                 return binary_nv_gpu, float, 256>(desc, c, a, b, stream, 4);
             case BinaryMode::Pow:
-                return binary_nv_gpu, float, 256>(desc, c, a, b, stream, 8);
             case BinaryMode::Mod:
                 return binary_nv_gpu, float, 128>(desc, c, a, b, stream, 4);
             case BinaryMode::Max:
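The Pow change in this patch trades accuracy for speed and then guards the fast path: `__powf` can return NaN for inputs the approximate intrinsic does not support (for example a negative base), so the result is checked and recomputed with the accurate `powf` only when needed. A self-contained sketch of that guard; `half_pow` is an illustrative name, not the library's API.

#include <cuda_fp16.h>
#include <cmath>

// Fast fp16 pow with an accurate fallback: try the approximate __powf
// intrinsic first, and only fall back to powf when the fast result is
// NaN (e.g. negative base), mirroring the half path in the patch above.
__device__ __forceinline__ __half half_pow(const __half a, const __half b) {
    float a_ = __half2float(a);
    float b_ = __half2float(b);
    float fast = __powf(a_, b_);                       // approximate path
    return __float2half(isnan(fast) ? powf(a_, b_) : fast);  // exact fallback
}

The common case pays only for the fast intrinsic; the extra `powf` call is taken on the rare inputs where the approximation is undefined.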
From 8d7be0ca166413991ca2a7581583f0ec8f19d290 Mon Sep 17 00:00:00 2001
From: Zimin Li
Date: Thu, 28 Nov 2024 15:05:13 +0800
Subject: [PATCH 5/5] Fixed incomplete test files

---
 operatorspy/tests/min.py |  64 +++++++++++++++++++-
 operatorspy/tests/mod.py | 125 +++++++++------------------
 2 files changed, 92 insertions(+), 97 deletions(-)

diff --git a/operatorspy/tests/min.py b/operatorspy/tests/min.py
index 68923344..548f4819 100644
--- a/operatorspy/tests/min.py
+++ b/operatorspy/tests/min.py
@@ -128,4 +128,66 @@ def test_cuda(lib, test_cases):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
     for c_shape, a_shape, b_shape, inplace in test_cases:
-    test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=
\ No newline at end of file
+        test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
+        test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
+    destroy_handle(lib, handle)
+
+
+def test_bang(lib, test_cases):
+    import torch_mlu
+
+    device = DeviceEnum.DEVICE_BANG
+    handle = create_handle(lib, device)
+    for c_shape, a_shape, b_shape, inplace in test_cases:
+        test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
+        test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
+    destroy_handle(lib, handle)
+
+
+if __name__ == "__main__":
+    test_cases = [
+        # c_shape, a_shape, b_shape, inplace
+        # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE),
+        # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE),
+        ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE),
+        ((), (), (), Inplace.OUT_OF_PLACE),
+        ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE),
+        ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE),
+        ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE),
+        ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE),
+        ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE),
+        ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
+        ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
+    ]
+    args = get_args()
+    lib = open_lib()
+    lib.infiniopCreateMinDescriptor.restype = c_int32
+    lib.infiniopCreateMinDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopMinDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+    lib.infiniopMin.restype = c_int32
+    lib.infiniopMin.argtypes = [
+        infiniopMinDescriptor_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+    lib.infiniopDestroyMinDescriptor.restype = c_int32
+    lib.infiniopDestroyMinDescriptor.argtypes = [
+        infiniopMinDescriptor_t,
+    ]
+
+    if args.cpu:
+        test_cpu(lib, test_cases)
+    if args.cuda:
+        test_cuda(lib, test_cases)
+    if args.bang:
+        test_bang(lib, test_cases)
+    if not (args.cpu or args.cuda or args.bang):
+        test_cpu(lib, test_cases)
+    print("\033[92mTest passed!\033[0m")
\ No newline at end of file
diff --git a/operatorspy/tests/mod.py b/operatorspy/tests/mod.py
index 9092f90b..7bf5beaf 100644
--- a/operatorspy/tests/mod.py
+++ b/operatorspy/tests/mod.py
@@ -150,26 +150,8 @@ def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
     for c_shape, a_shape, b_shape, inplace in test_cases:
-        test(
-            lib,
-            handle,
-            "cpu",
-            c_shape,
-            a_shape,
-            b_shape,
-            tensor_dtype=torch.float16,
-            inplace=inplace,
-        )
-        test(
-            lib,
-            handle,
-            "cpu",
-            c_shape,
-            a_shape,
-            b_shape,
-            tensor_dtype=torch.float32,
-            inplace=inplace,
-        )
+        test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
+        test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
     destroy_handle(lib, handle)
 
 
@@ -177,26 +159,8 @@ def test_cuda(lib, test_cases):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
     for c_shape, a_shape, b_shape, inplace in test_cases:
-        test(
-            lib,
-            handle,
-            "cuda",
-            c_shape,
-            a_shape,
-            b_shape,
-            tensor_dtype=torch.float16,
-            inplace=inplace,
-        )
-        test(
-            lib,
-            handle,
-            "cuda",
-            c_shape,
-            a_shape,
-            b_shape,
-            tensor_dtype=torch.float32,
-            inplace=inplace,
-        )
+        test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
+        test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
     destroy_handle(lib, handle)
- "mlu", - c_shape, - a_shape, - b_shape, - tensor_dtype=torch.float32, - inplace=inplace, - ) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) destroy_handle(lib, handle) @@ -239,41 +185,10 @@ def test_bang(lib, test_cases): ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.OUT_OF_PLACE), - ( - ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) - if not PROFILE - else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE) - ), - ( - ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) - if not PROFILE - else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE) - ), - ( - ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) - if not PROFILE - else ( - (32, 256, 112, 112), - (32, 256, 112, 1), - (32, 256, 112, 112), - Inplace.OUT_OF_PLACE, - ) - ), - ( - ( - (32, 3, 112, 112), - (32, 3, 112, 112), - (32, 3, 112, 112), - Inplace.OUT_OF_PLACE, - ) - if not PROFILE - else ( - (32, 256, 112, 112), - (32, 256, 112, 112), - (32, 256, 112, 112), - Inplace.OUT_OF_PLACE, - ) - ), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), ] args = get_args() lib = open_lib() @@ -288,4 +203,22 @@ def test_bang(lib, test_cases): lib.infiniopMod.restype = c_int32 lib.infiniopMod.argtypes = [ infiniopModDescriptor_t, - \ No newline at end of file + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyModDescriptor.restype = c_int32 + lib.infiniopDestroyModDescriptor.argtypes = [ + infiniopModDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") \ No newline at end of file