1 change: 1 addition & 0 deletions include/device.h
@@ -6,6 +6,7 @@ enum DeviceEnum {
    DevNvGpu,
    DevCambriconMlu,
    DevAscendNpu,
    DevKunlunXpu,
};

typedef enum DeviceEnum Device;
1 change: 1 addition & 0 deletions operatorspy/devices.py
@@ -3,3 +3,4 @@ class DeviceEnum:
    DEVICE_CUDA = 1
    DEVICE_BANG = 2
    DEVICE_ASCEND = 3
    DEVICE_KUNLUN = 4
38 changes: 37 additions & 1 deletion operatorspy/tests/matmul.py
@@ -291,6 +291,40 @@ def test_ascend(lib, test_cases):
    )

    destroy_handle(lib, handle)


def test_kunlun(lib, test_cases):
    import torch_xmlir
    # torch_xmlir presumably routes torch's "cuda" device to the Kunlun XPU,
    # which is why test() below is called with the "cuda" device string.
    device = DeviceEnum.DEVICE_KUNLUN
    handle = create_handle(lib, device)

    for (
        alpha,
        beta,
        a_shape,
        b_shape,
        c_shape,
        a_stride,
        b_stride,
        c_stride,
        dtype,
    ) in test_cases:
        test(
            lib,
            handle,
            "cuda",
            alpha,
            beta,
            a_shape,
            b_shape,
            c_shape,
            a_stride,
            b_stride,
            c_stride,
            dtype,
        )

    destroy_handle(lib, handle)

if __name__ == "__main__":
    test_cases = [
@@ -350,6 +384,8 @@ def test_ascend(lib, test_cases):
        test_bang(lib, test_cases)
    if args.ascend:
        test_ascend(lib, test_cases)
    if args.kunlun:
        test_kunlun(lib, test_cases)
    if not (args.cpu or args.cuda or args.bang or args.ascend or args.kunlun):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
5 changes: 5 additions & 0 deletions operatorspy/tests/test_utils.py
@@ -22,5 +22,10 @@ def get_args():
        action="store_true",
        help="Run ASCEND NPU test",
    )
    parser.add_argument(
        "--kunlun",
        action="store_true",
        help="Run KUNLUN XPU test",
    )

    return parser.parse_args()
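
Note: with this flag in place, the Kunlun path of a test is selected from the command line, e.g. `python operatorspy/tests/matmul.py --kunlun`; this presumably requires a build with `ENABLE_KUNLUN_XPU` defined and the `torch_xmlir` package installed.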
13 changes: 13 additions & 0 deletions src/devices/handle.cc
@@ -11,6 +11,9 @@
#ifdef ENABLE_ASCEND_NPU
#include "./ascend/ascend_handle.h"
#endif
#ifdef ENABLE_KUNLUN_XPU
#include "./kunlun/kunlun_handle.h"
#endif


__C infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, Device device, int device_id) {
@@ -40,6 +43,11 @@ __C infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, Device d
        case DevAscendNpu: {
            return createAscendHandle((AscendHandle_t *) handle_ptr, device_id);
        }
#endif
#ifdef ENABLE_KUNLUN_XPU
        case DevKunlunXpu: {
            return createKunlunHandle((KunlunHandle_t *) handle_ptr, device_id);
        }
#endif
    }
    return STATUS_BAD_DEVICE;
@@ -68,6 +76,11 @@ __C infiniopStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
        case DevAscendNpu: {
            return deleteAscendHandle((AscendHandle_t) handle);
        }
#endif
#ifdef ENABLE_KUNLUN_XPU
        case DevKunlunXpu: {
            return deleteKunlunHandle((KunlunHandle_t) handle);
        }
#endif
    }
    return STATUS_BAD_DEVICE;
20 changes: 20 additions & 0 deletions src/devices/kunlun/common_kunlun.h
@@ -0,0 +1,20 @@
#ifndef __COMMON_KUNLUN_H__
#define __COMMON_KUNLUN_H__

#include "xpu/runtime.h"
#include "xpu/runtime_ex.h"
#include "xpu/xdnn.h"

#include <cstdio>
#include <cstdlib>

namespace xdnn = baidu::xpu::api;

// Abort with a readable message whenever an XPU runtime call fails.
#define checkKUNLUNError(call)                                         \
    {                                                                  \
        auto err = call;                                               \
        if (XPU_SUCCESS != err) {                                      \
            fprintf(stderr, "KUNLUN error in %s:%i : %s.\n", __FILE__, \
                    __LINE__, xpu_strerror(err));                      \
            exit(EXIT_FAILURE);                                        \
        }                                                              \
    }

#endif
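
For reference, a minimal usage sketch of checkKUNLUNError (assuming the standard Kunlun runtime entry points xpu_malloc/xpu_free; the buffer size is illustrative):

#include "common_kunlun.h"

int main() {
    // Any call returning an XPU error code can be wrapped; on failure the
    // macro prints file/line plus xpu_strerror(err) and exits.
    void *buf = nullptr;
    checkKUNLUNError(xpu_malloc(&buf, 4096));
    checkKUNLUNError(xpu_free(buf));
    return 0;
}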
31 changes: 31 additions & 0 deletions src/devices/kunlun/kunlun_handle.cc
@@ -0,0 +1,31 @@
#include "kunlun_handle.h"

infiniopStatus_t createKunlunHandle(KunlunHandle_t *handle_ptr, int device_id) {
int device_count;
xpu_device_count(&device_count);
if (device_id >= device_count) {
return STATUS_BAD_DEVICE;
}

auto pool = std::make_shared<Pool<xdnnHandle_t>>();
if (xpu_set_device(device_id) != XPU_SUCCESS) {
return STATUS_BAD_DEVICE;
}
xdnnHandle_t handle = xdnn::create_context();
pool->push(std::move(handle));

*handle_ptr = new KunlunContext {
DevKunlunXpu,
device_id,
std::move(pool),
};

return STATUS_SUCCESS;
}

infiniopStatus_t deleteKunlunHandle(KunlunHandle_t handle_ptr) {
handle_ptr->xdnn_handles_t = nullptr;
delete handle_ptr;

return STATUS_SUCCESS;
}
37 changes: 37 additions & 0 deletions src/devices/kunlun/kunlun_handle.h
@@ -0,0 +1,37 @@
#ifndef __KUNLUN_HANDLE_H__
#define __KUNLUN_HANDLE_H__

#include "../pool.h"
#include "common_kunlun.h"
#include "device.h"
#include "status.h"

#include <memory>

typedef xdnn::Context *xdnnHandle_t;

struct KunlunContext {
    Device device;
    int device_id;
    std::shared_ptr<Pool<xdnnHandle_t>> xdnn_handles_t;
};
typedef struct KunlunContext *KunlunHandle_t;

infiniopStatus_t createKunlunHandle(KunlunHandle_t *handle_ptr, int device_id);

infiniopStatus_t deleteKunlunHandle(KunlunHandle_t handle_ptr);

// Borrow an xdnn context from the pool, bind it to `stream`, run `f`,
// then return the context for reuse.
template<typename T>
void use_xdnn(std::shared_ptr<Pool<xdnnHandle_t>> xdnn_handles_t,
              int device_id,
              XPUStream stream,
              T const &f) {
    auto handle = xdnn_handles_t->pop();
    if (!handle) {
        // Pool was empty: create a fresh context on the requested device.
        // Assign to the optional itself; dereferencing an empty optional
        // is undefined behavior.
        xpu_set_device(device_id);
        handle = xdnn::create_context();
    }
    (*handle)->set_stream(stream);
    f(*handle);
    xdnn_handles_t->push(std::move(*handle));
}

#endif
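
use_xdnn leans on the Pool<T> type from ../pool.h, which this diff does not show. A minimal sketch of the contract it appears to assume (pop() yields an empty std::optional when no context is cached); the repository's actual implementation may differ:

#include <mutex>
#include <optional>
#include <stack>

// Hypothetical stand-in for src/devices/pool.h, shown only to document the
// interface use_xdnn depends on.
template<typename T>
struct Pool {
    // Empty when nothing has been pushed yet; use_xdnn then creates a
    // fresh xdnn context on the requested device.
    std::optional<T> pop() {
        std::lock_guard<std::mutex> guard(mutex_);
        if (items_.empty()) {
            return std::nullopt;
        }
        T item = std::move(items_.top());
        items_.pop();
        return item;
    }

    // Hands an object back for later reuse by other operators.
    void push(T &&item) {
        std::lock_guard<std::mutex> guard(mutex_);
        items_.push(std::move(item));
    }

private:
    std::mutex mutex_;
    std::stack<T> items_;
};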
118 changes: 118 additions & 0 deletions src/ops/matmul/kunlun/matmul_kunlun.cc
@@ -0,0 +1,118 @@
#include "matmul_kunlun.h"
#include "../../../devices/kunlun/common_kunlun.h"
#include "../../utils.h"

template<typename T>
infiniopStatus_t matmul_kunlun(MatmulKunlunDescriptor_t desc,
void *c,
float beta,
void const *a,
void const *b,
float alpha,
void *stream) {
auto info = desc->info;

if (info.is_transed) {
std::swap(a, b);
}

auto transA = info.a_matrix.col_stride == 1 ? false : true;
auto transB = info.b_matrix.col_stride == 1 ? false : true;
// int64_t strideA = transA ? info.a_matrix.col_stride * info.a_matrix.cols
// : info.a_matrix.row_stride * info.a_matrix.rows;
// int64_t strideB = transB ? info.b_matrix.col_stride * info.b_matrix.cols
// : info.b_matrix.row_stride * info.b_matrix.rows;
// int64_t strideC = info.batch == 1
// ? info.c_matrix.row_stride * info.c_matrix.rows
// : info.c_matrix.stride;
use_xdnn(desc->xdnn_handles_t,
desc->device_id,
(XPUStream) stream,
[&](xdnnHandle_t handle) {
// xdnn::fc_batched<T, T, T, int16_t>(handle, info.batch, transA,transB,info.m,info.n,info.k,alpha,(T *) a,strideA,(T *) b,strideB,beta,(T *) c,strideC,nullptr,nullptr);
for (int i = 0; i < info.batch; i++) {
checkKUNLUNError((
xdnn::fc_fusion<T, T, T, int16_t>(
handle,
(T *) ((char *) a + i * info.a_matrix.stride * (desc->dtype).size),
(T *) ((char *) b + i * info.b_matrix.stride * (desc->dtype).size),
(T *) ((char *) c + i * info.c_matrix.stride * (desc->dtype).size),
info.m,
info.n,
info.k,
transA,
transB,
nullptr,
nullptr,
nullptr,
info.a_matrix.ld(),
info.b_matrix.ld(),
info.c_matrix.ld(),
alpha,
beta,
nullptr,
xdnn::Activation_t::LINEAR,
nullptr)));
}
});
return STATUS_SUCCESS;
}


infiniopStatus_t kunlunCreateMatmulDescriptor(KunlunHandle_t handle,
MatmulKunlunDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
float alpha,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
float beta) {
DT dtype = c_desc->dt;

if (dtype != F16 && dtype != F32) {
return STATUS_BAD_TENSOR_DTYPE;
}

infiniopStatus_t *status = new infiniopStatus_t{STATUS_EXECUTION_FAILED};
auto info = MatmulInfo(c_desc, a_desc, b_desc, status, false);
if (*status != STATUS_SUCCESS) {
return *status;
}

*desc_ptr = new MatmulKunlunDescriptor{
DevKunlunXpu,
dtype,
handle->device_id,
info,
alpha,
beta,
handle->xdnn_handles_t};
return STATUS_SUCCESS;
}

infiniopStatus_t kunlunMatmul(MatmulKunlunDescriptor_t desc,
void *workspace,
uint64_t workspace_size,
void *c,
void const *a,
void const *b,
void *stream) {
if (desc->dtype == F16) {
return matmul_kunlun<float16>(desc, c, desc->beta, a, b, desc->alpha, stream);
}
if (desc->dtype == F32) {
return matmul_kunlun<float>(desc, c, desc->beta, a, b, desc->alpha, stream);
}
return STATUS_BAD_TENSOR_DTYPE;
}


infiniopStatus_t kunlunGetMatmulWorkspaceSize(MatmulKunlunDescriptor_t desc, uint64_t *size) {
*size = 0;
return STATUS_SUCCESS;
}

infiniopStatus_t kunlunDestroyMatmulDescriptor(MatmulKunlunDescriptor_t desc) {
desc->xdnn_handles_t = nullptr;
delete desc;
return STATUS_SUCCESS;
}
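
Putting the pieces together, a hedged sketch of the full call order for this backend; the tensor descriptors and device buffers (a_desc, dev_a, etc.) are hypothetical and assumed to be created elsewhere, so they are declared extern to keep the sketch compilable:

#include "matmul_kunlun.h"

// Created elsewhere by the framework's descriptor/allocation APIs.
extern infiniopTensorDescriptor_t a_desc, b_desc, c_desc;
extern void *dev_a, *dev_b, *dev_c;

infiniopStatus_t run_matmul_once() {
    KunlunHandle_t handle = nullptr;
    infiniopStatus_t status = createKunlunHandle(&handle, /*device_id=*/0);
    if (status != STATUS_SUCCESS) return status;

    MatmulKunlunDescriptor_t desc = nullptr;
    status = kunlunCreateMatmulDescriptor(handle, &desc, c_desc,
                                          /*alpha=*/1.0f, a_desc, b_desc,
                                          /*beta=*/0.0f);
    if (status != STATUS_SUCCESS) return status;

    uint64_t workspace_size = 0;
    kunlunGetMatmulWorkspaceSize(desc, &workspace_size);  // always 0 here

    // A null stream presumably selects the default XPU stream.
    status = kunlunMatmul(desc, /*workspace=*/nullptr, workspace_size,
                          dev_c, dev_a, dev_b, /*stream=*/nullptr);

    kunlunDestroyMatmulDescriptor(desc);
    deleteKunlunHandle(handle);
    return status;
}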
41 changes: 41 additions & 0 deletions src/ops/matmul/kunlun/matmul_kunlun.h
@@ -0,0 +1,41 @@
#ifndef __KUNLUN_MATMUL_H__
#define __KUNLUN_MATMUL_H__

#include "../../../devices/kunlun/kunlun_handle.h"
#include "../../utils.h"
#include "../blas.h"
#include "operators.h"

typedef struct MatmulKunlunDescriptor {
    Device device;
    DT dtype;
    int device_id;
    MatmulInfo info;
    float alpha;
    float beta;
    std::shared_ptr<Pool<xdnnHandle_t>> xdnn_handles_t;
} MatmulKunlunDescriptor;

typedef struct MatmulKunlunDescriptor *MatmulKunlunDescriptor_t;

infiniopStatus_t kunlunCreateMatmulDescriptor(KunlunHandle_t handle,
                                              MatmulKunlunDescriptor_t *desc_ptr,
                                              infiniopTensorDescriptor_t c_desc,
                                              float alpha,
                                              infiniopTensorDescriptor_t a_desc,
                                              infiniopTensorDescriptor_t b_desc,
                                              float beta);

infiniopStatus_t kunlunGetMatmulWorkspaceSize(MatmulKunlunDescriptor_t desc, uint64_t *size);

infiniopStatus_t kunlunMatmul(MatmulKunlunDescriptor_t desc,
                              void *workspace,
                              uint64_t workspace_size,
                              void *c,
                              void const *a,
                              void const *b,
                              void *stream);

infiniopStatus_t kunlunDestroyMatmulDescriptor(MatmulKunlunDescriptor_t desc);

#endif