1 change: 1 addition & 0 deletions include/device.h
@@ -6,6 +6,7 @@ enum DeviceEnum {
    DevNvGpu,
    DevCambriconMlu,
    DevAscendNpu,
    DevKunlunXpu,
};

typedef enum DeviceEnum Device;
1 change: 1 addition & 0 deletions operatorspy/devices.py
@@ -3,3 +3,4 @@ class DeviceEnum:
    DEVICE_CUDA = 1
    DEVICE_BANG = 2
    DEVICE_ASCEND = 3
    DEVICE_KUNLUN = 4
38 changes: 37 additions & 1 deletion operatorspy/tests/matmul.py
@@ -291,6 +291,40 @@ def test_ascend(lib, test_cases):
    )

    destroy_handle(lib, handle)


def test_kunlun(lib, test_cases):
    import torch_xmlir
    # torch_xmlir presumably routes torch's "cuda" device to the Kunlun XPU,
    # which is why test() below is called with the "cuda" device string.
    device = DeviceEnum.DEVICE_KUNLUN
    handle = create_handle(lib, device)

    for (
        alpha,
        beta,
        a_shape,
        b_shape,
        c_shape,
        a_stride,
        b_stride,
        c_stride,
        dtype,
    ) in test_cases:
        test(
            lib,
            handle,
            "cuda",
            alpha,
            beta,
            a_shape,
            b_shape,
            c_shape,
            a_stride,
            b_stride,
            c_stride,
            dtype,
        )

    destroy_handle(lib, handle)

if __name__ == "__main__":
    test_cases = [
@@ -350,6 +384,8 @@ def test_ascend(lib, test_cases):
        test_bang(lib, test_cases)
    if args.ascend:
        test_ascend(lib, test_cases)
    if args.kunlun:
        test_kunlun(lib, test_cases)
    if not (args.cpu or args.cuda or args.bang or args.ascend or args.kunlun):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
5 changes: 5 additions & 0 deletions operatorspy/tests/test_utils.py
@@ -22,5 +22,10 @@ def get_args():
        action="store_true",
        help="Run ASCEND NPU test",
    )
    parser.add_argument(
        "--kunlun",
        action="store_true",
        help="Run KUNLUN XPU test",
    )

    return parser.parse_args()
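
Note: with this flag in place, the Kunlun path of a test is selected from the command line, e.g. `python operatorspy/tests/matmul.py --kunlun`; this presumably requires a build with `ENABLE_KUNLUN_XPU` defined and the `torch_xmlir` package installed.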
13 changes: 13 additions & 0 deletions src/devices/handle.cc
@@ -11,6 +11,9 @@
#ifdef ENABLE_ASCEND_NPU
#include "./ascend/ascend_handle.h"
#endif
#ifdef ENABLE_KUNLUN_XPU
#include "./kunlun/kunlun_handle.h"
#endif


__C infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, Device device, int device_id) {
@@ -40,6 +43,11 @@ __C infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, Device d
        case DevAscendNpu: {
            return createAscendHandle((AscendHandle_t *) handle_ptr, device_id);
        }
#endif
#ifdef ENABLE_KUNLUN_XPU
        case DevKunlunXpu: {
            return createKunlunHandle((KunlunHandle_t *) handle_ptr, device_id);
        }
#endif
    }
    return STATUS_BAD_DEVICE;
@@ -68,6 +76,11 @@ __C infiniopStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
        case DevAscendNpu: {
            return deleteAscendHandle((AscendHandle_t) handle);
        }
#endif
#ifdef ENABLE_KUNLUN_XPU
        case DevKunlunXpu: {
            return deleteKunlunHandle((KunlunHandle_t) handle);
        }
#endif
    }
    return STATUS_BAD_DEVICE;
20 changes: 20 additions & 0 deletions src/devices/kunlun/common_kunlun.h
@@ -0,0 +1,20 @@
#ifndef __COMMON_KUNLUN_H__
#define __COMMON_KUNLUN_H__

#include "xpu/runtime.h"
#include "xpu/runtime_ex.h"
#include "xpu/xdnn.h"

#include <cstdio>
#include <cstdlib>

namespace xdnn = baidu::xpu::api;

// Abort with a readable message whenever an XPU runtime call fails.
#define checkKUNLUNError(call)                                         \
    {                                                                  \
        auto err = call;                                               \
        if (XPU_SUCCESS != err) {                                      \
            fprintf(stderr, "KUNLUN error in %s:%i : %s.\n", __FILE__, \
                    __LINE__, xpu_strerror(err));                      \
            exit(EXIT_FAILURE);                                        \
        }                                                              \
    }

#endif
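
For reference, a minimal usage sketch of checkKUNLUNError (assuming the standard Kunlun runtime entry points xpu_malloc/xpu_free; the buffer size is illustrative):

#include "common_kunlun.h"

int main() {
    // Any call returning an XPU error code can be wrapped; on failure the
    // macro prints file/line plus xpu_strerror(err) and exits.
    void *buf = nullptr;
    checkKUNLUNError(xpu_malloc(&buf, 4096));
    checkKUNLUNError(xpu_free(buf));
    return 0;
}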
31 changes: 31 additions & 0 deletions src/devices/kunlun/kunlun_handle.cc
@@ -0,0 +1,31 @@
#include "kunlun_handle.h"

infiniopStatus_t createKunlunHandle(KunlunHandle_t *handle_ptr, int device_id) {
int device_count;
xpu_device_count(&device_count);
if (device_id >= device_count) {
return STATUS_BAD_DEVICE;
}

auto pool = std::make_shared<Pool<xdnnHandle_t>>();
if (xpu_set_device(device_id) != XPU_SUCCESS) {
return STATUS_BAD_DEVICE;
}
xdnnHandle_t handle = xdnn::create_context();
pool->push(std::move(handle));

*handle_ptr = new KunlunContext {
DevKunlunXpu,
device_id,
std::move(pool),
};

return STATUS_SUCCESS;
}

infiniopStatus_t deleteKunlunHandle(KunlunHandle_t handle_ptr) {
handle_ptr->xdnn_handles_t = nullptr;
delete handle_ptr;

return STATUS_SUCCESS;
}
37 changes: 37 additions & 0 deletions src/devices/kunlun/kunlun_handle.h
@@ -0,0 +1,37 @@
#ifndef __KUNLUN_HANDLE_H__
#define __KUNLUN_HANDLE_H__

#include "../pool.h"
#include "common_kunlun.h"
#include "device.h"
#include "status.h"

#include <memory>

typedef xdnn::Context *xdnnHandle_t;

struct KunlunContext {
    Device device;
    int device_id;
    std::shared_ptr<Pool<xdnnHandle_t>> xdnn_handles_t;
};
typedef struct KunlunContext *KunlunHandle_t;

infiniopStatus_t createKunlunHandle(KunlunHandle_t *handle_ptr, int device_id);

infiniopStatus_t deleteKunlunHandle(KunlunHandle_t handle_ptr);

// Borrow an xdnn context from the pool, bind it to `stream`, run `f`,
// then return the context for reuse.
template<typename T>
void use_xdnn(std::shared_ptr<Pool<xdnnHandle_t>> xdnn_handles_t,
              int device_id,
              XPUStream stream,
              T const &f) {
    auto handle = xdnn_handles_t->pop();
    if (!handle) {
        // Pool was empty: create a fresh context on the requested device.
        // Assign to the optional itself; dereferencing an empty optional
        // is undefined behavior.
        xpu_set_device(device_id);
        handle = xdnn::create_context();
    }
    (*handle)->set_stream(stream);
    f(*handle);
    xdnn_handles_t->push(std::move(*handle));
}

#endif
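
use_xdnn leans on the Pool<T> type from ../pool.h, which this diff does not show. A minimal sketch of the contract it appears to assume (pop() yields an empty std::optional when no context is cached); the repository's actual implementation may differ:

#include <mutex>
#include <optional>
#include <stack>

// Hypothetical stand-in for src/devices/pool.h, shown only to document the
// interface use_xdnn depends on.
template<typename T>
struct Pool {
    // Empty when nothing has been pushed yet; use_xdnn then creates a
    // fresh xdnn context on the requested device.
    std::optional<T> pop() {
        std::lock_guard<std::mutex> guard(mutex_);
        if (items_.empty()) {
            return std::nullopt;
        }
        T item = std::move(items_.top());
        items_.pop();
        return item;
    }

    // Hands an object back for later reuse by other operators.
    void push(T &&item) {
        std::lock_guard<std::mutex> guard(mutex_);
        items_.push(std::move(item));
    }

private:
    std::mutex mutex_;
    std::stack<T> items_;
};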
118 changes: 118 additions & 0 deletions src/ops/matmul/kunlun/matmul_kunlun.cc
@@ -0,0 +1,118 @@
#include "matmul_kunlun.h"
#include "../../../devices/kunlun/common_kunlun.h"
#include "../../utils.h"

template<typename T>
infiniopStatus_t matmul_kunlun(MatmulKunlunDescriptor_t desc,
void *c,
float beta,
void const *a,
void const *b,
float alpha,
void *stream) {
auto info = desc->info;

if (info.is_transed) {
std::swap(a, b);
}

auto transA = info.a_matrix.col_stride == 1 ? false : true;
auto transB = info.b_matrix.col_stride == 1 ? false : true;
// int64_t strideA = transA ? info.a_matrix.col_stride * info.a_matrix.cols
// : info.a_matrix.row_stride * info.a_matrix.rows;
// int64_t strideB = transB ? info.b_matrix.col_stride * info.b_matrix.cols
// : info.b_matrix.row_stride * info.b_matrix.rows;
// int64_t strideC = info.batch == 1
// ? info.c_matrix.row_stride * info.c_matrix.rows
// : info.c_matrix.stride;
use_xdnn(desc->xdnn_handles_t,
desc->device_id,
(XPUStream) stream,
[&](xdnnHandle_t handle) {
// xdnn::fc_batched<T, T, T, int16_t>(handle, info.batch, transA,transB,info.m,info.n,info.k,alpha,(T *) a,strideA,(T *) b,strideB,beta,(T *) c,strideC,nullptr,nullptr);
for (int i = 0; i < info.batch; i++) {
checkKUNLUNError((
xdnn::fc_fusion<T, T, T, int16_t>(
handle,
(T *) ((char *) a + i * info.a_matrix.stride * (desc->dtype).size),
(T *) ((char *) b + i * info.b_matrix.stride * (desc->dtype).size),
(T *) ((char *) c + i * info.c_matrix.stride * (desc->dtype).size),
info.m,
info.n,
info.k,
transA,
transB,
nullptr,
nullptr,
nullptr,
info.a_matrix.ld(),
info.b_matrix.ld(),
info.c_matrix.ld(),
alpha,
beta,
nullptr,
xdnn::Activation_t::LINEAR,
nullptr)));
}
});
return STATUS_SUCCESS;
}


infiniopStatus_t kunlunCreateMatmulDescriptor(KunlunHandle_t handle,
MatmulKunlunDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
float alpha,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
float beta) {
DT dtype = c_desc->dt;

if (dtype != F16 && dtype != F32) {
return STATUS_BAD_TENSOR_DTYPE;
}

infiniopStatus_t *status = new infiniopStatus_t{STATUS_EXECUTION_FAILED};
auto info = MatmulInfo(c_desc, a_desc, b_desc, status, false);
if (*status != STATUS_SUCCESS) {
return *status;
}

*desc_ptr = new MatmulKunlunDescriptor{
DevKunlunXpu,
dtype,
handle->device_id,
info,
alpha,
beta,
handle->xdnn_handles_t};
return STATUS_SUCCESS;
}

infiniopStatus_t kunlunMatmul(MatmulKunlunDescriptor_t desc,
void *workspace,
uint64_t workspace_size,
void *c,
void const *a,
void const *b,
void *stream) {
if (desc->dtype == F16) {
return matmul_kunlun<float16>(desc, c, desc->beta, a, b, desc->alpha, stream);
}
if (desc->dtype == F32) {
return matmul_kunlun<float>(desc, c, desc->beta, a, b, desc->alpha, stream);
}
return STATUS_BAD_TENSOR_DTYPE;
}


infiniopStatus_t kunlunGetMatmulWorkspaceSize(MatmulKunlunDescriptor_t desc, uint64_t *size) {
*size = 0;
return STATUS_SUCCESS;
}

infiniopStatus_t kunlunDestroyMatmulDescriptor(MatmulKunlunDescriptor_t desc) {
desc->xdnn_handles_t = nullptr;
delete desc;
return STATUS_SUCCESS;
}
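
Putting the pieces together, a hedged sketch of the full call order for this backend; the tensor descriptors and device buffers (a_desc, dev_a, etc.) are hypothetical and assumed to be created elsewhere, so they are declared extern to keep the sketch compilable:

#include "matmul_kunlun.h"

// Created elsewhere by the framework's descriptor/allocation APIs.
extern infiniopTensorDescriptor_t a_desc, b_desc, c_desc;
extern void *dev_a, *dev_b, *dev_c;

infiniopStatus_t run_matmul_once() {
    KunlunHandle_t handle = nullptr;
    infiniopStatus_t status = createKunlunHandle(&handle, /*device_id=*/0);
    if (status != STATUS_SUCCESS) return status;

    MatmulKunlunDescriptor_t desc = nullptr;
    status = kunlunCreateMatmulDescriptor(handle, &desc, c_desc,
                                          /*alpha=*/1.0f, a_desc, b_desc,
                                          /*beta=*/0.0f);
    if (status != STATUS_SUCCESS) return status;

    uint64_t workspace_size = 0;
    kunlunGetMatmulWorkspaceSize(desc, &workspace_size);  // always 0 here

    // A null stream presumably selects the default XPU stream.
    status = kunlunMatmul(desc, /*workspace=*/nullptr, workspace_size,
                          dev_c, dev_a, dev_b, /*stream=*/nullptr);

    kunlunDestroyMatmulDescriptor(desc);
    deleteKunlunHandle(handle);
    return status;
}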
41 changes: 41 additions & 0 deletions src/ops/matmul/kunlun/matmul_kunlun.h
@@ -0,0 +1,41 @@
#ifndef __KUNLUN_MATMUL_H__
#define __KUNLUN_MATMUL_H__

#include "../../../devices/kunlun/kunlun_handle.h"
#include "../../utils.h"
#include "../blas.h"
#include "operators.h"

typedef struct MatmulKunlunDescriptor {
    Device device;
    DT dtype;
    int device_id;
    MatmulInfo info;
    float alpha;
    float beta;
    std::shared_ptr<Pool<xdnnHandle_t>> xdnn_handles_t;
} MatmulKunlunDescriptor;

typedef struct MatmulKunlunDescriptor *MatmulKunlunDescriptor_t;

infiniopStatus_t kunlunCreateMatmulDescriptor(KunlunHandle_t handle,
                                              MatmulKunlunDescriptor_t *desc_ptr,
                                              infiniopTensorDescriptor_t c_desc,
                                              float alpha,
                                              infiniopTensorDescriptor_t a_desc,
                                              infiniopTensorDescriptor_t b_desc,
                                              float beta);

infiniopStatus_t kunlunGetMatmulWorkspaceSize(MatmulKunlunDescriptor_t desc, uint64_t *size);

infiniopStatus_t kunlunMatmul(MatmulKunlunDescriptor_t desc,
                              void *workspace,
                              uint64_t workspace_size,
                              void *c,
                              void const *a,
                              void const *b,
                              void *stream);

infiniopStatus_t kunlunDestroyMatmulDescriptor(MatmulKunlunDescriptor_t desc);

#endif