Merge pull request #68 from PanZezhong1725/add_relu

PanZezhong1725 · web-flow · commit 49ee9f2b779f · 2024-11-06T10:08:22.000+08:00
Add ReLU CPU and CUDA implementation
diff --git a/include/infini_operators.h b/include/infini_operators.h
@@ -7,6 +7,7 @@
 #include "ops/mlp/mlp.h"
 #include "ops/random_sample/random_sample.h"
 #include "ops/rearrange/rearrange.h"
+#include "ops/relu/relu.h"
 #include "ops/rms_norm/rms_norm.h"
 #include "ops/rotary_embedding/rotary_embedding.h"
 #include "ops/swiglu/swiglu.h"
diff --git a/include/ops/relu/relu.h b/include/ops/relu/relu.h
@@ -0,0 +1,25 @@
+#ifndef RELU_H
+#define RELU_H
+
+#include "../../export.h"
+#include "../../operators.h"
+
+// Public view of a ReLU operator descriptor. Only the leading `device` tag is
+// exposed here; the backend descriptors added in this change (CPU and CUDA)
+// also start with a Device field.
+// NOTE(review): presumably the dispatcher reads `device` to pick the backend —
+// confirm against src/ops/relu/operator.cc.
+typedef struct ReluDescriptor {
+    Device device;
+} ReluDescriptor;
+
+// Handle type passed to the infiniopRelu* entry points.
+typedef ReluDescriptor *infiniopReluDescriptor_t;
+
+// Creates a ReLU descriptor for output tensor `y` and input tensor `x` and
+// stores it in `*desc_ptr`.
+// The CPU and CUDA backends in this change reject the request with
+//   STATUS_BAD_TENSOR_SHAPE   when the ranks or shapes of `y` and `x` differ,
+//   STATUS_BAD_TENSOR_STRIDES when either tensor is not contiguous,
+//   STATUS_BAD_TENSOR_DTYPE   when the dtype is not F16/F32 or `y` and `x` disagree.
+__C __export infiniopStatus_t infiniopCreateReluDescriptor(infiniopHandle_t handle,
+                                                           infiniopReluDescriptor_t *desc_ptr,
+                                                           infiniopTensorDescriptor_t y,
+                                                           infiniopTensorDescriptor_t x);
+
+// Computes y[i] = x[i] < 0 ? 0 : x[i] for every element described by `desc`.
+// `y` is the output buffer and `x` the input buffer; the accompanying Python
+// test also passes the same buffer for both (in-place use).
+// `stream` is a backend-specific handle; the CPU backend does not use it.
+__C __export infiniopStatus_t infiniopRelu(infiniopReluDescriptor_t desc,
+                                           void *y,
+                                           void const *x,
+                                           void *stream);
+
+// Releases a descriptor obtained from infiniopCreateReluDescriptor.
+__C __export infiniopStatus_t infiniopDestroyReluDescriptor(infiniopReluDescriptor_t desc);
+
+#endif
diff --git a/operatorspy/tests/relu.py b/operatorspy/tests/relu.py
@@ -0,0 +1,175 @@
+from ctypes import POINTER, Structure, c_int32, c_void_p
+import ctypes
+import sys
+import os
+import time
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
+from operatorspy import (
+    open_lib,
+    to_tensor,
+    DeviceEnum,
+    infiniopHandle_t,
+    infiniopTensorDescriptor_t,
+    create_handle,
+    destroy_handle,
+    check_error,
+)
+
+from operatorspy.tests.test_utils import get_args
+from enum import Enum, auto
+import torch
+
+# Constants controlling whether to profile the PyTorch and library functions.
+# NOTE: a synchronization call must be added manually to the library function,
+#       e.g., cudaDeviceSynchronize() for CUDA
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+class Inplace(Enum):
+    """Where a test case asks the operator to write its result.
+
+    OUT_OF_PLACE uses a separate output tensor; INPLACE_X reuses the input
+    tensor ``x`` as the output.
+    """
+
+    OUT_OF_PLACE = auto()
+    INPLACE_X = auto()
+
+
+class ReluDescriptor(Structure):
+    """ctypes mirror of the C ``ReluDescriptor`` struct (single ``device`` field)."""
+
+    # NOTE(review): maps the C `Device` field to a 32-bit int — assumes the
+    # enum is int-sized on the target ABI; confirm.
+    _fields_ = [("device", c_int32)]
+
+
+# Pointer type matching the C `infiniopReluDescriptor_t` handle.
+infiniopReluDescriptor_t = POINTER(ReluDescriptor)
+
+
+def relu(x):
+    if PROFILE:
+        ans = torch.nn.functional.relu(x).to(x.dtype)
+        torch.cuda.synchronize()
+        return ans
+    return torch.nn.functional.relu(x).to(x.dtype)
+
+
+def test(
+    lib,
+    handle,
+    torch_device,
+    tensor_shape, 
+    tensor_dtype=torch.float16,
+    inplace=Inplace.OUT_OF_PLACE,
+):
+    print(
+        f"Testing Relu on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} inplace: {inplace.name}"
+    )
+
+    x = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1
+    y = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else x
+    
+    for i in range(NUM_PRERUN if PROFILE else 1):
+        ans = relu(x)
+    if PROFILE:
+        start_time = time.time()
+        for i in range(NUM_ITERATIONS):
+            _ = relu(x)
+        elapsed = (time.time() - start_time) / NUM_ITERATIONS
+        print(f"pytorch time: {elapsed :6f}")
+
+    x_tensor = to_tensor(x, lib)
+    y_tensor = to_tensor(y, lib) if inplace == Inplace.OUT_OF_PLACE else x_tensor
+    descriptor = infiniopReluDescriptor_t()
+
+    check_error(
+        lib.infiniopCreateReluDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            y_tensor.descriptor,
+            x_tensor.descriptor,
+        )
+    )
+    for i in range(NUM_PRERUN if PROFILE else 1):
+        lib.infiniopRelu(
+            descriptor, y_tensor.data, x_tensor.data, None
+        )
+    if PROFILE:
+        start_time = time.time()
+        for i in range(NUM_ITERATIONS):
+            lib.infiniopRelu(
+                descriptor, y_tensor.data, x_tensor.data, None
+            )
+        elapsed = (time.time() - start_time) / NUM_ITERATIONS
+        print(f"    lib time: {elapsed :6f}")
+    
+    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
+    check_error(lib.infiniopDestroyReluDescriptor(descriptor))
+
+
+def test_cpu(lib, test_cases):
+    device = DeviceEnum.DEVICE_CPU
+    handle = create_handle(lib, device)
+    for tensor_shape, inplace in test_cases:
+        test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float16, inplace=inplace)
+        test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float32, inplace=inplace)
+    destroy_handle(lib, handle)
+
+
+def test_cuda(lib, test_cases):
+    device = DeviceEnum.DEVICE_CUDA
+    handle = create_handle(lib, device)
+    for tensor_shape, inplace in test_cases:
+        test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float16, inplace=inplace)
+        test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float32, inplace=inplace)
+    destroy_handle(lib, handle)
+
+
+def test_bang(lib, test_cases):
+    import torch_mlu
+
+    device = DeviceEnum.DEVICE_BANG
+    handle = create_handle(lib, device)
+    for tensor_shape, inplace in test_cases:
+        test(lib, handle, "mlu", tensor_shape, tensor_dtype=torch.float16, inplace=inplace)
+        test(lib, handle, "mlu", tensor_shape, tensor_dtype=torch.float32, inplace=inplace)
+    destroy_handle(lib, handle)
+
+
+if __name__ == "__main__":
+    test_cases = [
+        # tensor_shape, inplace
+        ((), Inplace.OUT_OF_PLACE),
+        ((), Inplace.INPLACE_X),
+        ((1, 3), Inplace.OUT_OF_PLACE),
+        ((3, 3), Inplace.OUT_OF_PLACE),
+        ((3, 3, 13, 9, 17), Inplace.INPLACE_X),
+        ((32, 20, 512), Inplace.INPLACE_X),
+        ((33, 333, 333), Inplace.OUT_OF_PLACE),
+        ((32, 256, 112, 112), Inplace.OUT_OF_PLACE),
+    ]
+    args = get_args()
+    lib = open_lib()
+    lib.infiniopCreateReluDescriptor.restype = c_int32
+    lib.infiniopCreateReluDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopReluDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+    lib.infiniopRelu.restype = c_int32
+    lib.infiniopRelu.argtypes = [
+        infiniopReluDescriptor_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+    lib.infiniopDestroyReluDescriptor.restype = c_int32
+    lib.infiniopDestroyReluDescriptor.argtypes = [
+        infiniopReluDescriptor_t,
+    ]
+
+    if args.cpu:
+        test_cpu(lib, test_cases)
+    if args.cuda:
+        test_cuda(lib, test_cases)
+    if args.bang:
+        test_bang(lib, test_cases)
+    if not (args.cpu or args.cuda or args.bang):
+        test_cpu(lib, test_cases)
+    print("\033[92mTest passed!\033[0m")
+
diff --git a/src/ops/relu/cpu/relu_cpu.cc b/src/ops/relu/cpu/relu_cpu.cc
@@ -0,0 +1,72 @@
+#include "relu_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include "../../utils.h"
+
+// Validates the output/input tensor descriptors and allocates the CPU ReLU
+// descriptor. `y` and `x` must have identical shapes, be contiguous, and share
+// a dtype of F16 or F32; the handle argument is not used.
+infiniopStatus_t cpuCreateReluDescriptor(infiniopHandle_t,
+                                         ReluCpuDescriptor_t *desc_ptr,
+                                         infiniopTensorDescriptor_t y,
+                                         infiniopTensorDescriptor_t x) {
+    uint64_t const rank = y->ndim;
+    if (x->ndim != rank) {
+        return STATUS_BAD_TENSOR_SHAPE;
+    }
+
+    // Compare the shapes dimension by dimension while folding the total
+    // element count (1 for a rank-0 tensor).
+    uint64_t element_count = 1;
+    for (uint64_t dim = 0; dim < rank; ++dim) {
+        if (x->shape[dim] != y->shape[dim]) {
+            return STATUS_BAD_TENSOR_SHAPE;
+        }
+        element_count *= y->shape[dim];
+    }
+
+    if (!(is_contiguous(y) && is_contiguous(x))) {
+        return STATUS_BAD_TENSOR_STRIDES;
+    }
+
+    // Unsupported dtype and mismatched dtypes report the same status.
+    if ((y->dt != F16 && y->dt != F32) || y->dt != x->dt) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    // Field order: device, dtype, data_size.
+    auto *descriptor = new ReluCpuDescriptor{
+        DevCpu,
+        y->dt,
+        element_count,
+    };
+    *desc_ptr = descriptor;
+
+    return STATUS_SUCCESS;
+}
+
+// Frees a descriptor allocated by cpuCreateReluDescriptor; always reports
+// STATUS_SUCCESS.
+infiniopStatus_t cpuDestroyReluDescriptor(ReluCpuDescriptor_t desc) {
+    delete desc;
+    return STATUS_SUCCESS;
+}
+
+// Element-wise ReLU over desc->data_size elements: y[i] = x[i] < 0 ? 0 : x[i].
+// With Tdata = uint16_t each element is converted with f16_to_f32, clamped in
+// float, and converted back with f32_to_f16; other types are clamped directly.
+template<typename Tdata>
+infiniopStatus_t relu_cpu(ReluCpuDescriptor_t desc, void *y, void const *x) {
+    auto const *src = reinterpret_cast<Tdata const *>(x);
+    auto *dst = reinterpret_cast<Tdata *>(y);
+    uint64_t const count = desc->data_size;
+
+#pragma omp parallel for
+    for (uint64_t idx = 0; idx < count; ++idx) {
+        if constexpr (std::is_same<Tdata, uint16_t>::value) {
+            float const value = f16_to_f32(src[idx]);
+            dst[idx] = f32_to_f16(value < 0 ? 0 : value);
+        } else {
+            Tdata const value = src[idx];
+            dst[idx] = value < 0 ? 0 : value;
+        }
+    }
+    return STATUS_SUCCESS;
+}
+
+// Dispatches the CPU ReLU kernel on the dtype stored in the descriptor.
+// `stream` is not used on the CPU. Any dtype other than F16/F32 yields
+// STATUS_BAD_TENSOR_DTYPE.
+infiniopStatus_t cpuRelu(ReluCpuDescriptor_t desc,
+                         void *y, void const *x,
+                         void *stream) {
+    if (desc->dtype == F32) {
+        return relu_cpu<float>(desc, y, x);
+    }
+    if (desc->dtype == F16) {
+        // F16 data is handled as raw 16-bit words.
+        return relu_cpu<uint16_t>(desc, y, x);
+    }
+    return STATUS_BAD_TENSOR_DTYPE;
+}
diff --git a/src/ops/relu/cpu/relu_cpu.h b/src/ops/relu/cpu/relu_cpu.h
@@ -0,0 +1,26 @@
+#ifndef __CPU_RELU_H__
+#define __CPU_RELU_H__
+
+#include "operators.h"
+#include <numeric>
+
+// CPU-side ReLU descriptor. It is brace-initialized positionally in
+// cpuCreateReluDescriptor, so the field order must stay in sync with that
+// initializer.
+struct ReluCpuDescriptor {
+    // Backend tag; set to DevCpu on creation.
+    Device device;
+    // Element type shared by input and output (F16 or F32 after validation).
+    DT dtype;
+    // Total number of elements: the product of the output tensor's dimensions.
+    uint64_t data_size;
+};
+
+// Handle type used by the CPU ReLU entry points.
+typedef struct ReluCpuDescriptor *ReluCpuDescriptor_t;
+
+// Validates `y`/`x` (same shape, contiguous, matching F16/F32 dtype) and
+// allocates a descriptor into the output pointer. The handle is unused.
+infiniopStatus_t cpuCreateReluDescriptor(infiniopHandle_t,
+                                         ReluCpuDescriptor_t *,
+                                         infiniopTensorDescriptor_t y,
+                                         infiniopTensorDescriptor_t x);
+
+// Writes the element-wise ReLU of `x` into `y`; `stream` is unused on the CPU.
+infiniopStatus_t cpuRelu(ReluCpuDescriptor_t desc,
+                         void *y, void const *x,
+                         void *stream);
+
+// Frees a descriptor created by cpuCreateReluDescriptor.
+infiniopStatus_t cpuDestroyReluDescriptor(ReluCpuDescriptor_t desc);
+
+#endif
diff --git a/src/ops/relu/cuda/relu.cc b/src/ops/relu/cuda/relu.cc
@@ -0,0 +1,45 @@
+#include "relu.cuh"
+#include "../../../devices/cuda/common_cuda.h"
+#include "../../utils.h"
+
+// Validates the output/input tensor descriptors and allocates the CUDA ReLU
+// descriptor. `y` and `x` must have identical shapes, be contiguous, and share
+// a dtype of F16 or F32. The device id and the first grid-size limit are
+// copied from `handle` into the descriptor.
+infiniopStatus_t cudaCreateReluDescriptor(CudaHandle_t handle,
+                                          ReluCudaDescriptor_t *desc_ptr,
+                                          infiniopTensorDescriptor_t y,
+                                          infiniopTensorDescriptor_t x) {
+    uint64_t ndim = y->ndim;
+    if (x->ndim != ndim) {
+        return STATUS_BAD_TENSOR_SHAPE;
+    }
+
+    // Compare the shapes dimension by dimension while folding the total
+    // element count (1 for a rank-0 tensor).
+    uint64_t element_count = 1;
+    for (uint64_t dim = 0; dim < ndim; ++dim) {
+        if (x->shape[dim] != y->shape[dim]) {
+            return STATUS_BAD_TENSOR_SHAPE;
+        }
+        element_count *= y->shape[dim];
+    }
+
+    if (!(is_contiguous(y) && is_contiguous(x))) {
+        return STATUS_BAD_TENSOR_STRIDES;
+    }
+
+    // Unsupported dtype and mismatched dtypes report the same status.
+    if ((y->dt != F16 && y->dt != F32) || y->dt != x->dt) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    // Positional initialization: the order must match the field order of
+    // ReluCudaDescriptor.
+    auto *descriptor = new ReluCudaDescriptor{
+        DevNvGpu,
+        y->dt,
+        handle->device_id,
+        ndim,
+        element_count,
+        static_cast<uint64_t>(handle->prop.maxGridSize[0]),
+    };
+    *desc_ptr = descriptor;
+
+    return STATUS_SUCCESS;
+}
+
+// Frees a descriptor allocated by cudaCreateReluDescriptor; always reports
+// STATUS_SUCCESS.
+infiniopStatus_t cudaDestroyReluDescriptor(ReluCudaDescriptor_t desc) {
+    delete desc;
+    return STATUS_SUCCESS;
+}
diff --git a/src/ops/relu/cuda/relu.cu b/src/ops/relu/cuda/relu.cu
diff --git a/src/ops/relu/cuda/relu.cuh b/src/ops/relu/cuda/relu.cuh
diff --git a/src/ops/relu/operator.cc b/src/ops/relu/operator.cc