From 0f060e505c726371d5cbbc23b0fcc1b0c9b14dae Mon Sep 17 00:00:00 2001 From: zhangyue Date: Tue, 7 Jan 2025 15:00:05 +0800 Subject: [PATCH] kunlunxin support and kunlun matmul --- include/device.h | 1 + operatorspy/devices.py | 1 + operatorspy/tests/matmul.py | 38 +++++++- operatorspy/tests/test_utils.py | 5 ++ src/devices/handle.cc | 13 +++ src/devices/kunlun/common_kunlun.h | 20 +++++ src/devices/kunlun/kunlun_handle.cc | 31 +++++++ src/devices/kunlun/kunlun_handle.h | 37 ++++++++ src/ops/matmul/kunlun/matmul_kunlun.cc | 118 +++++++++++++++++++++++++ src/ops/matmul/kunlun/matmul_kunlun.h | 41 +++++++++ src/ops/matmul/operator.cc | 35 ++++++++ xmake.lua | 31 +++++++ 12 files changed, 370 insertions(+), 1 deletion(-) create mode 100644 src/devices/kunlun/common_kunlun.h create mode 100644 src/devices/kunlun/kunlun_handle.cc create mode 100644 src/devices/kunlun/kunlun_handle.h create mode 100644 src/ops/matmul/kunlun/matmul_kunlun.cc create mode 100644 src/ops/matmul/kunlun/matmul_kunlun.h diff --git a/include/device.h b/include/device.h index 701b6632..ec09c893 100644 --- a/include/device.h +++ b/include/device.h @@ -6,6 +6,7 @@ enum DeviceEnum { DevNvGpu, DevCambriconMlu, DevAscendNpu, + DevKunlunXpu, }; typedef enum DeviceEnum Device; diff --git a/operatorspy/devices.py b/operatorspy/devices.py index 4984502a..ee4d50f1 100644 --- a/operatorspy/devices.py +++ b/operatorspy/devices.py @@ -3,3 +3,4 @@ class DeviceEnum: DEVICE_CUDA = 1 DEVICE_BANG = 2 DEVICE_ASCEND = 3 + DEVICE_KUNLUN = 4 diff --git a/operatorspy/tests/matmul.py b/operatorspy/tests/matmul.py index a919b47d..c9570087 100644 --- a/operatorspy/tests/matmul.py +++ b/operatorspy/tests/matmul.py @@ -291,6 +291,40 @@ def test_ascend(lib, test_cases): ) destroy_handle(lib, handle) + + +def test_kunlun(lib, test_cases): + import torch_xmlir + device = DeviceEnum.DEVICE_KUNLUN + handle = create_handle(lib, device) + + for ( + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, 
+ dtype, + ) in test_cases: + test( + lib, + handle, + "cuda", + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) + + destroy_handle(lib, handle) if __name__ == "__main__": test_cases = [ @@ -350,6 +384,8 @@ def test_ascend(lib, test_cases): test_bang(lib, test_cases) if args.ascend: test_ascend(lib, test_cases) - if not (args.cpu or args.cuda or args.bang or args.ascend): + if args.kunlun: + test_kunlun(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.ascend or args.kunlun): test_cpu(lib, test_cases) print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/test_utils.py b/operatorspy/tests/test_utils.py index a00a91ec..0217831c 100644 --- a/operatorspy/tests/test_utils.py +++ b/operatorspy/tests/test_utils.py @@ -22,5 +22,10 @@ def get_args(): action="store_true", help="Run ASCEND NPU test", ) + parser.add_argument( + "--kunlun", + action="store_true", + help="Run kunlun test", + ) return parser.parse_args() diff --git a/src/devices/handle.cc b/src/devices/handle.cc index 97126a9d..78ba0502 100644 --- a/src/devices/handle.cc +++ b/src/devices/handle.cc @@ -11,6 +11,9 @@ #ifdef ENABLE_ASCEND_NPU #include "./ascend/ascend_handle.h" #endif +#ifdef ENABLE_KUNLUN_XPU +#include "./kunlun/kunlun_handle.h" +#endif __C infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, Device device, int device_id) { @@ -40,6 +43,11 @@ __C infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, Device d case DevAscendNpu: { return createAscendHandle((AscendHandle_t *) handle_ptr, device_id); } +#endif +#ifdef ENABLE_KUNLUN_XPU + case DevKunlunXpu: { + return createKunlunHandle((KunlunHandle_t *) handle_ptr, device_id); + } #endif } return STATUS_BAD_DEVICE; @@ -68,6 +76,11 @@ __C infiniopStatus_t infiniopDestroyHandle(infiniopHandle_t handle) { case DevAscendNpu: { return deleteAscendHandle((AscendHandle_t) handle); } +#endif +#ifdef ENABLE_KUNLUN_XPU + case DevKunlunXpu: { + return 
deleteKunlunHandle((KunlunHandle_t) handle); + } #endif } return STATUS_BAD_DEVICE; diff --git a/src/devices/kunlun/common_kunlun.h b/src/devices/kunlun/common_kunlun.h new file mode 100644 index 00000000..20d50168 --- /dev/null +++ b/src/devices/kunlun/common_kunlun.h @@ -0,0 +1,20 @@ +#ifndef __COMMON_KUNLUN_H__ +#define __COMMON_KUNLUN_H__ + +#include "xpu/runtime.h" +#include "xpu/runtime_ex.h" +#include "xpu/xdnn.h" + +namespace xdnn = baidu::xpu::api; + +#define checkKUNLUNError(call) \ + { \ + auto err = call; \ + if (XPU_SUCCESS != err) { \ + fprintf(stderr, "KUNLUN error in %s:%i : %s.\n", __FILE__, \ + __LINE__, xpu_strerror(err)); \ + exit(EXIT_FAILURE); \ + } \ + } + +#endif diff --git a/src/devices/kunlun/kunlun_handle.cc b/src/devices/kunlun/kunlun_handle.cc new file mode 100644 index 00000000..f9b475e6 --- /dev/null +++ b/src/devices/kunlun/kunlun_handle.cc @@ -0,0 +1,31 @@ +#include "kunlun_handle.h" + +infiniopStatus_t createKunlunHandle(KunlunHandle_t *handle_ptr, int device_id) { + int device_count; + xpu_device_count(&device_count); + if (device_id >= device_count) { + return STATUS_BAD_DEVICE; + } + + auto pool = std::make_shared<Pool<xdnnHandle_t>>(); + if (xpu_set_device(device_id) != XPU_SUCCESS) { + return STATUS_BAD_DEVICE; + } + xdnnHandle_t handle = xdnn::create_context(); + pool->push(std::move(handle)); + + *handle_ptr = new KunlunContext { + DevKunlunXpu, + device_id, + std::move(pool), + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t deleteKunlunHandle(KunlunHandle_t handle_ptr) { + handle_ptr->xdnn_handles_t = nullptr; + delete handle_ptr; + + return STATUS_SUCCESS; +} diff --git a/src/devices/kunlun/kunlun_handle.h b/src/devices/kunlun/kunlun_handle.h new file mode 100644 index 00000000..b9286b61 --- /dev/null +++ b/src/devices/kunlun/kunlun_handle.h @@ -0,0 +1,37 @@ +#ifndef __KUNLUN_HANDLE_H__ +#define __KUNLUN_HANDLE_H__ + +#include "../pool.h" +#include "common_kunlun.h" +#include "device.h" +#include "status.h" + +typedef xdnn::Context 
*xdnnHandle_t; + +struct KunlunContext { + Device device; + int device_id; + std::shared_ptr<Pool<xdnnHandle_t>> xdnn_handles_t; +}; +typedef struct KunlunContext *KunlunHandle_t; + +infiniopStatus_t createKunlunHandle(KunlunHandle_t *handle_ptr, int device_id); + +infiniopStatus_t deleteKunlunHandle(KunlunHandle_t handle_ptr); + +template <typename T> +void use_xdnn(std::shared_ptr<Pool<xdnnHandle_t>> xdnn_handles_t, + int device_id, + XPUStream stream, + T const &f) { + auto handle = xdnn_handles_t->pop(); + if (!handle) { + xpu_set_device(device_id); + *handle = xdnn::create_context(); + } + (*handle)->set_stream(stream); + f(*handle); + xdnn_handles_t->push(std::move(*handle)); +} + +#endif diff --git a/src/ops/matmul/kunlun/matmul_kunlun.cc b/src/ops/matmul/kunlun/matmul_kunlun.cc new file mode 100644 index 00000000..0a165c5d --- /dev/null +++ b/src/ops/matmul/kunlun/matmul_kunlun.cc @@ -0,0 +1,118 @@ +#include "matmul_kunlun.h" +#include "../../../devices/kunlun/common_kunlun.h" +#include "../../utils.h" + +template <typename T> +infiniopStatus_t matmul_kunlun(MatmulKunlunDescriptor_t desc, + void *c, + float beta, + void const *a, + void const *b, + float alpha, + void *stream) { + auto info = desc->info; + + if (info.is_transed) { + std::swap(a, b); + } + + auto transA = info.a_matrix.col_stride == 1 ? false : true; + auto transB = info.b_matrix.col_stride == 1 ? false : true; + // int64_t strideA = transA ? info.a_matrix.col_stride * info.a_matrix.cols + // : info.a_matrix.row_stride * info.a_matrix.rows; + // int64_t strideB = transB ? info.b_matrix.col_stride * info.b_matrix.cols + // : info.b_matrix.row_stride * info.b_matrix.rows; + // int64_t strideC = info.batch == 1 + // ? 
info.c_matrix.row_stride * info.c_matrix.rows + // : info.c_matrix.stride; + use_xdnn(desc->xdnn_handles_t, + desc->device_id, + (XPUStream) stream, + [&](xdnnHandle_t handle) { + // xdnn::fc_batched(handle, info.batch, transA,transB,info.m,info.n,info.k,alpha,(T *) a,strideA,(T *) b,strideB,beta,(T *) c,strideC,nullptr,nullptr); + for (int i = 0; i < info.batch; i++) { + checkKUNLUNError(( + xdnn::fc_fusion<T, T, T, int16_t>( + handle, + (T *) ((char *) a + i * info.a_matrix.stride * (desc->dtype).size), + (T *) ((char *) b + i * info.b_matrix.stride * (desc->dtype).size), + (T *) ((char *) c + i * info.c_matrix.stride * (desc->dtype).size), + info.m, + info.n, + info.k, + transA, + transB, + nullptr, + nullptr, + nullptr, + info.a_matrix.ld(), + info.b_matrix.ld(), + info.c_matrix.ld(), + alpha, + beta, + nullptr, + xdnn::Activation_t::LINEAR, + nullptr))); + } + }); + return STATUS_SUCCESS; +} + + +infiniopStatus_t kunlunCreateMatmulDescriptor(KunlunHandle_t handle, + MatmulKunlunDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta) { + DT dtype = c_desc->dt; + + if (dtype != F16 && dtype != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + + infiniopStatus_t *status = new infiniopStatus_t{STATUS_EXECUTION_FAILED}; + auto info = MatmulInfo(c_desc, a_desc, b_desc, status, false); + if (*status != STATUS_SUCCESS) { + return *status; + } + + *desc_ptr = new MatmulKunlunDescriptor{ + DevKunlunXpu, + dtype, + handle->device_id, + info, + alpha, + beta, + handle->xdnn_handles_t}; + return STATUS_SUCCESS; +} + +infiniopStatus_t kunlunMatmul(MatmulKunlunDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *c, + void const *a, + void const *b, + void *stream) { + if (desc->dtype == F16) { + return matmul_kunlun<float16>(desc, c, desc->beta, a, b, desc->alpha, stream); + } + if (desc->dtype == F32) { + return matmul_kunlun<float>(desc, c, desc->beta, a, b, desc->alpha, stream); + } 
+ return STATUS_BAD_TENSOR_DTYPE; +} + + +infiniopStatus_t kunlunGetMatmulWorkspaceSize(MatmulKunlunDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t kunlunDestroyMatmulDescriptor(MatmulKunlunDescriptor_t desc) { + desc->xdnn_handles_t = nullptr; + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/matmul/kunlun/matmul_kunlun.h b/src/ops/matmul/kunlun/matmul_kunlun.h new file mode 100644 index 00000000..324700ce --- /dev/null +++ b/src/ops/matmul/kunlun/matmul_kunlun.h @@ -0,0 +1,41 @@ +#ifndef __KUNLUN_MATMUL_H__ +#define __KUNLUN_MATMUL_H__ + +#include "../../../devices/kunlun/kunlun_handle.h" +#include "../../utils.h" +#include "../blas.h" +#include "operators.h" + +typedef struct MatmulKunlunDescriptor { + Device device; + DT dtype; + int device_id; + MatmulInfo info; + float alpha; + float beta; + std::shared_ptr<Pool<xdnnHandle_t>> xdnn_handles_t; +} MatmulKunlunDescriptor; + +typedef struct MatmulKunlunDescriptor *MatmulKunlunDescriptor_t; + +infiniopStatus_t kunlunCreateMatmulDescriptor(KunlunHandle_t handle, + MatmulKunlunDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta); + +infiniopStatus_t kunlunGetMatmulWorkspaceSize(MatmulKunlunDescriptor_t desc, uint64_t *size); + +infiniopStatus_t kunlunMatmul(MatmulKunlunDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *c, + void const *a, + void const *b, + void *stream); + +infiniopStatus_t kunlunDestroyMatmulDescriptor(MatmulKunlunDescriptor_t desc); + +#endif diff --git a/src/ops/matmul/operator.cc b/src/ops/matmul/operator.cc index 444168b6..ecfb36ea 100644 --- a/src/ops/matmul/operator.cc +++ b/src/ops/matmul/operator.cc @@ -14,6 +14,9 @@ #ifdef ENABLE_ASCEND_NPU #include "ascend/matmul_aclnn.h" #endif +#ifdef ENABLE_KUNLUN_XPU +#include "kunlun/matmul_kunlun.h" +#endif __C infiniopStatus_t 
infiniopCreateMatmulDescriptor(infiniopHandle_t handle, infiniopMatmulDescriptor_t *desc_ptr, @@ -48,6 +51,17 @@ __C infiniopStatus_t infiniopCreateMatmulDescriptor(infiniopHandle_t handle, beta, 1); } +#endif +#ifdef ENABLE_KUNLUN_XPU + case DevKunlunXpu: { + return kunlunCreateMatmulDescriptor((KunlunHandle_t) handle, + (MatmulKunlunDescriptor_t *) desc_ptr, + c_desc, + alpha, + a_desc, + b_desc, + beta); + } #endif } return STATUS_BAD_DEVICE; @@ -75,6 +89,12 @@ __C infiniopStatus_t infiniopGetMatmulWorkspaceSize(infiniopMatmulDescriptor_t d return aclnnGetMatmulWorkspaceSize((MatmulAclnnDescriptor_t) desc, size); } +#endif +#ifdef ENABLE_KUNLUN_XPU + case DevKunlunXpu: { + return kunlunGetMatmulWorkspaceSize((MatmulKunlunDescriptor_t) desc, + size); + } #endif } return STATUS_BAD_DEVICE; @@ -104,6 +124,16 @@ __C infiniopStatus_t infiniopMatmul(infiniopMatmulDescriptor_t desc, void *works a, b, stream); +#endif +#ifdef ENABLE_KUNLUN_XPU + case DevKunlunXpu: + return kunlunMatmul((MatmulKunlunDescriptor_t) desc, + workspace, + workspace_size, + c, + a, + b, + stream); #endif } return STATUS_BAD_DEVICE; @@ -130,6 +160,11 @@ __C infiniopStatus_t infiniopDestroyMatmulDescriptor(infiniopMatmulDescriptor_t case DevAscendNpu: { return aclnnDestroyMatmulDescriptor((MatmulAclnnDescriptor_t) desc); } +#endif +#ifdef ENABLE_KUNLUN_XPU + case DevKunlunXpu: { + return kunlunDestroyMatmulDescriptor((MatmulKunlunDescriptor_t) desc); + } #endif } return STATUS_BAD_DEVICE; diff --git a/xmake.lua b/xmake.lua index 327e91ef..a8b9111c 100644 --- a/xmake.lua +++ b/xmake.lua @@ -40,6 +40,13 @@ option("ascend-npu") add_defines("ENABLE_ASCEND_NPU") option_end() +option("kunlun-xpu") + set_default(false) + set_showmenu(true) + set_description("Enable or disable Kunlun XPU kernel") + add_defines("ENABLE_KUNLUN_XPU") +option_end() + if is_mode("debug") then add_cxflags("-g -O0") add_defines("DEBUG_MODE") @@ -212,6 +219,27 @@ if has_config("ascend-npu") then target_end() end +if 
has_config("kunlun-xpu") then + + add_defines("ENABLE_KUNLUN_XPU") + local KUNLUN_HOME = os.getenv("KUNLUN_HOME") + + add_includedirs(KUNLUN_HOME .. "/include") + add_linkdirs(KUNLUN_HOME .. "/lib64") + add_links("xpurt") + add_links("xpuapi") + + target("kunlun-xpu") + set_kind("static") + set_languages("cxx17") + on_install(function (target) end) + -- Add include dirs + add_files("src/devices/kunlun/*.cc", "src/ops/*/kunlun/*.cc") + add_cxflags("-lstdc++ -Wall -Werror -fPIC") + + target_end() +end + target("infiniop") set_kind("shared") @@ -227,6 +255,9 @@ target("infiniop") if has_config("ascend-npu") then add_deps("ascend-npu") end + if has_config("kunlun-xpu") then + add_deps("kunlun-xpu") + end set_languages("cxx17") add_files("src/devices/handle.cc") add_files("src/ops/*/operator.cc")