
Commit fdbf030

Merge pull request #89 from PanZezhong1725/add_global_avg_pool
Add Global Average Pool
2 parents 1e74a5f + 5e26117

9 files changed: +1089 −0 lines changed
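
For reference, a global average pool collapses every dimension after the batch (N) and channel (C) dimensions to 1, producing one mean per (n, c) pair; this is the computation the CPU kernel in this commit performs and that the test script checks against torch.mean. A minimal stand-alone C++ sketch of that semantics on a contiguous buffer (illustration only, not part of this diff):

    // Global average pooling over a contiguous (N*C) x spatial buffer:
    // y has one element per (n, c) pair, the mean of its `spatial` inputs.
    #include <cstdint>
    #include <vector>

    std::vector<float> global_avg_pool_ref(const std::vector<float> &x,
                                           uint64_t nc,        // N * C
                                           uint64_t spatial) { // product of the remaining dims
        std::vector<float> y(nc, 0.0f);
        for (uint64_t i = 0; i < nc; ++i) {
            double sum = 0.0; // accumulate in double for accuracy
            for (uint64_t j = 0; j < spatial; ++j) {
                sum += x[i * spatial + j];
            }
            y[i] = static_cast<float>(sum / spatial);
        }
        return y;
    }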

include/infini_operators.h

Lines changed: 1 addition & 0 deletions
@@ -2,6 +2,7 @@
 #include "ops/add/add.h"
 #include "ops/attention/attention.h"
 #include "ops/causal_softmax/causal_softmax.h"
+#include "ops/global_avg_pool/global_avg_pool.h"
 #include "ops/expand/expand.h"
 #include "ops/gemm/gemm.h"
 #include "ops/conv/conv.h"
include/ops/global_avg_pool/global_avg_pool.h

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
#ifndef GLOBAL_AVG_POOL_H
#define GLOBAL_AVG_POOL_H

#include "../../export.h"
#include "../../operators.h"

typedef struct GlobalAvgPoolDescriptor {
    Device device;
} GlobalAvgPoolDescriptor;

typedef GlobalAvgPoolDescriptor *infiniopGlobalAvgPoolDescriptor_t;

__C __export infiniopStatus_t infiniopCreateGlobalAvgPoolDescriptor(infiniopHandle_t handle,
                                                                    infiniopGlobalAvgPoolDescriptor_t *desc_ptr,
                                                                    infiniopTensorDescriptor_t y,
                                                                    infiniopTensorDescriptor_t x);

__C __export infiniopStatus_t infiniopGetGlobalAvgPoolWorkspaceSize(infiniopGlobalAvgPoolDescriptor_t desc, uint64_t *size);

__C __export infiniopStatus_t infiniopGlobalAvgPool(infiniopGlobalAvgPoolDescriptor_t desc,
                                                    void *workspace, uint64_t workspace_size,
                                                    void *y, void const *x, void *stream);

__C __export infiniopStatus_t infiniopDestroyGlobalAvgPoolDescriptor(infiniopGlobalAvgPoolDescriptor_t desc);

#endif
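
Taken together, these four entry points follow the usual descriptor/workspace pattern. Below is a hedged sketch of the call sequence; the helper name run_global_avg_pool is illustrative, the handle, tensor descriptors, and data buffers are assumed to be created elsewhere (those helpers are not part of this diff), and malloc is used for the workspace, which is only appropriate for the CPU backend:

    // Sketch of the intended call sequence (assumptions noted above; not repo code).
    #include <cstdint>
    #include <cstdlib>
    #include "ops/global_avg_pool/global_avg_pool.h"

    infiniopStatus_t run_global_avg_pool(infiniopHandle_t handle,
                                         infiniopTensorDescriptor_t y_desc,
                                         infiniopTensorDescriptor_t x_desc,
                                         void *y_data, void const *x_data,
                                         void *stream) {
        infiniopGlobalAvgPoolDescriptor_t desc;
        infiniopStatus_t status =
            infiniopCreateGlobalAvgPoolDescriptor(handle, &desc, y_desc, x_desc);
        if (status != STATUS_SUCCESS) return status;

        uint64_t workspace_size = 0;
        status = infiniopGetGlobalAvgPoolWorkspaceSize(desc, &workspace_size);
        if (status != STATUS_SUCCESS) {
            infiniopDestroyGlobalAvgPoolDescriptor(desc);
            return status;
        }

        // Device memory would be needed here on GPU backends.
        void *workspace = workspace_size ? std::malloc(workspace_size) : nullptr;
        status = infiniopGlobalAvgPool(desc, workspace, workspace_size,
                                       y_data, x_data, stream);

        std::free(workspace);
        infiniopDestroyGlobalAvgPoolDescriptor(desc);
        return status;
    }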
Lines changed: 197 additions & 0 deletions
@@ -0,0 +1,197 @@
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
import ctypes
import sys
import os
import time

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
    open_lib,
    to_tensor,
    DeviceEnum,
    infiniopHandle_t,
    infiniopTensorDescriptor_t,
    create_handle,
    destroy_handle,
    check_error,
)

from operatorspy.tests.test_utils import get_args
import torch

# Constants that control whether to profile the PyTorch and lib functions.
# NOTE: a synchronization call (e.g., cudaDeviceSynchronize() for CUDA) must be
# added to the lib function manually when profiling.
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000


class GlobalAvgPoolDescriptor(Structure):
    _fields_ = [("device", c_int32)]


infiniopGlobalAvgPoolDescriptor_t = POINTER(GlobalAvgPoolDescriptor)


def inferShape(x):
    # output shape: keep N and C, reduce every remaining dimension to 1
    return x.shape[:2] + (1,) * (x.dim() - 2)


def globalAvgPool(x):
    # PyTorch reference: mean over every dimension after N and C
    y = torch.mean(x, dim=tuple(range(2, x.dim())), keepdim=True)
    if PROFILE:
        torch.cuda.synchronize()
    return y.view(*inferShape(x))


def test(
    lib,
    handle,
    torch_device,
    x_shape,
    tensor_dtype=torch.float16,
):
    print(
        f"Testing GlobalAvgPool on {torch_device} with input tensor_shape: {x_shape} dtype: {tensor_dtype}"
    )

    x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
    y = torch.zeros(inferShape(x), dtype=tensor_dtype).to(torch_device)

    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = globalAvgPool(x)
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = globalAvgPool(x)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"pytorch time: {elapsed :6f}")

    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib)
    descriptor = infiniopGlobalAvgPoolDescriptor_t()

    check_error(
        lib.infiniopCreateGlobalAvgPoolDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
        )
    )
    workspaceSize = ctypes.c_uint64(0)
    check_error(
        lib.infiniopGetGlobalAvgPoolWorkspaceSize(
            descriptor, ctypes.byref(workspaceSize)
        )
    )
    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
        torch_device
    )
    workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))

    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(
            lib.infiniopGlobalAvgPool(
                descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None
            )
        )
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            lib.infiniopGlobalAvgPool(
                descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None
            )
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed :6f}")

    # compare the library output against the PyTorch reference
    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyGlobalAvgPoolDescriptor(descriptor))


def test_cpu(lib, test_cases):
    device = DeviceEnum.DEVICE_CPU
    handle = create_handle(lib, device)
    for x_shape in test_cases:
        test(lib, handle, "cpu", x_shape, tensor_dtype=torch.float16)
        test(lib, handle, "cpu", x_shape, tensor_dtype=torch.float32)
    destroy_handle(lib, handle)


def test_cuda(lib, test_cases):
    device = DeviceEnum.DEVICE_CUDA
    handle = create_handle(lib, device)
    for x_shape in test_cases:
        test(lib, handle, "cuda", x_shape, tensor_dtype=torch.float16)
        test(lib, handle, "cuda", x_shape, tensor_dtype=torch.float32)
    destroy_handle(lib, handle)


def test_bang(lib, test_cases):
    import torch_mlu

    device = DeviceEnum.DEVICE_BANG
    handle = create_handle(lib, device)
    for x_shape in test_cases:
        test(lib, handle, "mlu", x_shape, tensor_dtype=torch.float16)
        test(lib, handle, "mlu", x_shape, tensor_dtype=torch.float32)
    destroy_handle(lib, handle)


if __name__ == "__main__":
    test_cases = [
        # x_shape
        ((1, 3, 3)),
        ((1, 3, 1, 1, 3)),
        ((1, 3, 1, 1, 257)),
        ((1, 2, 1, 1, 514)),
        ((1, 3, 1, 1, 1025)),
        ((32, 256, 1, 112, 112)),
        ((2, 3, 2048000)),
        ((2, 1, 10243)),
        ((2, 20, 100)),
        ((3, 33, 333)),
        ((32, 20, 512)),
        ((3, 3, 11, 11, 11, 3, 2)),
        ((32, 256, 1, 112, 112)),
        ((32, 256, 112, 112)),
    ]
    args = get_args()
    lib = open_lib()
    lib.infiniopCreateGlobalAvgPoolDescriptor.restype = c_int32
    lib.infiniopCreateGlobalAvgPoolDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopGlobalAvgPoolDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetGlobalAvgPoolWorkspaceSize.restype = c_int32
    lib.infiniopGetGlobalAvgPoolWorkspaceSize.argtypes = [
        infiniopGlobalAvgPoolDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopGlobalAvgPool.restype = c_int32
    lib.infiniopGlobalAvgPool.argtypes = [
        infiniopGlobalAvgPoolDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyGlobalAvgPoolDescriptor.restype = c_int32
    lib.infiniopDestroyGlobalAvgPoolDescriptor.argtypes = [
        infiniopGlobalAvgPoolDescriptor_t,
    ]

    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
#include "global_avg_pool_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../utils.h"

infiniopStatus_t cpuCreateGlobalAvgPoolDescriptor(infiniopHandle_t,
                                                  GlobalAvgPoolCpuDescriptor_t *desc_ptr,
                                                  infiniopTensorDescriptor_t y,
                                                  infiniopTensorDescriptor_t x) {
    uint64_t ndim = y->ndim;
    if (ndim < 2 || ndim != x->ndim) {
        return STATUS_BAD_TENSOR_SHAPE;
    }
    // y must match x in the first two (N, C) dims and be 1 in every remaining dim
    for (size_t i = 0; i < ndim; ++i) {
        if (i < 2 && y->shape[i] != x->shape[i]) {
            return STATUS_BAD_TENSOR_SHAPE;
        } else if (i >= 2 && y->shape[i] != 1) {
            return STATUS_BAD_TENSOR_SHAPE;
        }
    }
    if (!is_contiguous(y) || !is_contiguous(x)) {
        return STATUS_BAD_TENSOR_STRIDES;
    }
    if (y->dt != F16 && y->dt != F32) {
        return STATUS_BAD_TENSOR_DTYPE;
    }
    if (y->dt != x->dt) {
        return STATUS_BAD_TENSOR_DTYPE;
    }

    // y_data_size: number of outputs (N * C); x_per_NC_data_size: elements averaged per output
    uint64_t y_data_size = std::accumulate(y->shape, y->shape + 2, 1ULL, std::multiplies<uint64_t>());
    uint64_t x_per_NC_data_size = std::accumulate(x->shape + 2, x->shape + ndim, 1ULL, std::multiplies<uint64_t>());

    *desc_ptr = new GlobalAvgPoolCpuDescriptor{
        DevCpu,
        y->dt,
        y_data_size,
        x_per_NC_data_size,
    };

    return STATUS_SUCCESS;
}

infiniopStatus_t cpuGetGlobalAvgPoolWorkspaceSize(GlobalAvgPoolCpuDescriptor_t desc, uint64_t *size) {
    *size = 0;
    return STATUS_SUCCESS;
}

infiniopStatus_t cpuDestroyGlobalAvgPoolDescriptor(GlobalAvgPoolCpuDescriptor_t desc) {
    delete desc;
    return STATUS_SUCCESS;
}

template<typename Tdata>
infiniopStatus_t global_avg_pool_cpu(GlobalAvgPoolCpuDescriptor_t desc, void *y, void const *x) {
    auto x_ = reinterpret_cast<Tdata const *>(x);
    auto y_ = reinterpret_cast<Tdata *>(y);
    const auto x_size = desc->x_per_NC_data_size;

    // one mean per (n, c) pair, averaging the x_size trailing elements
#pragma omp parallel for
    for (uint64_t i = 0; i < desc->y_data_size; ++i) {
        if constexpr (std::is_same<Tdata, uint16_t>::value) {
            // fp16 path: accumulate in fp32 and round back to fp16 once to limit rounding error
            float sum = std::accumulate(x_ + i * x_size, x_ + (i + 1) * x_size, 0.0f,
                                        [](float res, uint16_t value) {
                                            return res + f16_to_f32(value);
                                        });
            y_[i] = f32_to_f16(sum / x_size);
        } else {
            y_[i] = std::accumulate(x_ + i * x_size, x_ + (i + 1) * x_size, Tdata(0)) / x_size;
        }
    }
    return STATUS_SUCCESS;
}

infiniopStatus_t cpuGlobalAvgPool(GlobalAvgPoolCpuDescriptor_t desc,
                                  void *workspace, uint64_t workspace_size, void *y, void const *x,
                                  void *stream) {
    if (desc->dtype == F16) {
        return global_avg_pool_cpu<uint16_t>(desc, y, x);
    }
    if (desc->dtype == F32) {
        return global_avg_pool_cpu<float>(desc, y, x);
    }
    return STATUS_BAD_TENSOR_DTYPE;
}
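
A small worked example of the descriptor bookkeeping computed in cpuCreateGlobalAvgPoolDescriptor, using one of the shapes from the test script (illustration only, not repo code):

    // For x of shape (N, C, H, W) = (32, 256, 112, 112):
    //   y_data_size        = N * C = 32 * 256  = 8192 pooled outputs
    //   x_per_NC_data_size = H * W = 112 * 112 = 12544 elements averaged per output
    #include <cstdint>
    #include <cstdio>
    #include <functional>
    #include <numeric>
    #include <vector>

    int main() {
        std::vector<uint64_t> shape = {32, 256, 112, 112};
        uint64_t y_data_size = std::accumulate(shape.begin(), shape.begin() + 2, 1ULL,
                                               std::multiplies<uint64_t>()); // 8192
        uint64_t x_per_NC = std::accumulate(shape.begin() + 2, shape.end(), 1ULL,
                                            std::multiplies<uint64_t>());    // 12544
        std::printf("%llu %llu\n", (unsigned long long) y_data_size,
                    (unsigned long long) x_per_NC); // prints: 8192 12544
        return 0;
    }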
Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
#ifndef __CPU_GLOBAL_AVG_POOL_H__
#define __CPU_GLOBAL_AVG_POOL_H__

#include "operators.h"
#include <numeric>

struct GlobalAvgPoolCpuDescriptor {
    Device device;
    DT dtype;
    uint64_t y_data_size;
    uint64_t x_per_NC_data_size;
};

typedef struct GlobalAvgPoolCpuDescriptor *GlobalAvgPoolCpuDescriptor_t;

infiniopStatus_t cpuCreateGlobalAvgPoolDescriptor(infiniopHandle_t,
                                                  GlobalAvgPoolCpuDescriptor_t *,
                                                  infiniopTensorDescriptor_t y,
                                                  infiniopTensorDescriptor_t x);

infiniopStatus_t cpuGetGlobalAvgPoolWorkspaceSize(GlobalAvgPoolCpuDescriptor_t desc, uint64_t *size);

infiniopStatus_t cpuGlobalAvgPool(GlobalAvgPoolCpuDescriptor_t desc,
                                  void *workspace, uint64_t workspace_size, void *y, void const *x,
                                  void *stream);

infiniopStatus_t cpuDestroyGlobalAvgPoolDescriptor(GlobalAvgPoolCpuDescriptor_t desc);

#endif
