-// #ifdef USE_CUDA
+#ifdef USE_CUDA

-// #include "../../../src/kernels/rms_normalization/cpu_kernel.hh"
-// #include "../../../src/kernels/rms_normalization/cuda_kernel.hh"
-// #include "hardware/device_manager.h"
-// #include <gtest/gtest.h>
-// #include <numeric>
+#include "../../../src/kernels/rms_normalization/cpu_kernel.hh"
+#include "../../../src/kernels/rms_normalization/cuda_kernel.hh"
+#include "hardware/device_manager.h"
+#include <gtest/gtest.h>
+#include <numeric>

-// using namespace refactor;
-// using namespace kernel;
-// using namespace hardware;
+using namespace refactor;
+using namespace kernel;
+using namespace hardware;

-// TEST(kernel, RmsNormalizationCuda) {
-//     // build routine
-//     auto y = Tensor::share(DataType::F32, Shape{2, 3, 4});
-//     auto x = Tensor::share(DataType::F32, Shape{2, 3, 4});
-//     auto w = Tensor::share(DataType::F32, Shape{4});
-//     auto kernel = RmsNormalizationCuda::build(0, *x),
-//          kCpu = RmsNormalizationCpu::build(0, *x);
-//     ASSERT_TRUE(kernel && kCpu);
-//     auto res = runtime::Resources();
-//     auto routine = kernel->lower(res).routine,
-//          rCpu = kCpu->lower(res).routine;
-//     // malloc
-//     auto &dev = *device::init(Device::Type::Nvidia, 0, "");
-//     auto yGpu = dev.malloc(y->bytesSize()),
-//          xGpu = dev.malloc(x->bytesSize()),
-//          wGpu = dev.malloc(w->bytesSize());
-//     // put input data
-//     std::vector<float> y_(y->elementsSize());
-//     std::vector<float> x_(x->elementsSize());
-//     std::vector<float> w_(w->elementsSize());
-//     std::iota(x_.begin(), x_.end(), 0);
-//     std::iota(w_.begin(), w_.end(), 1);
-//     xGpu->copyFromHost(x_.data(), x->bytesSize());
-//     wGpu->copyFromHost(w_.data(), w->bytesSize());
-//     // inference
-//     {
-//         void const *inputs[]{*xGpu, *wGpu};
-//         void *outputs[]{*yGpu};
-//         routine(res, nullptr, inputs, outputs);
-//     }
-//     {
-//         void const *inputs[]{x_.data(), w_.data()};
-//         void *outputs[]{y_.data()};
-//         rCpu(res, nullptr, inputs, outputs);
-//     }
-//     // check
-//     std::vector<float> result(y->elementsSize());
-//     yGpu->copyToHost(result.data(), y->bytesSize());
-//     EXPECT_EQ(result, y_);
-// }
+TEST(kernel, RmsNormalizationCuda) {
+    // build routine
+    auto y = Tensor::share(DataType::F32, Shape{2, 3, 4});
+    auto x = Tensor::share(DataType::F32, Shape{2, 3, 4});
+    auto w = Tensor::share(DataType::F32, Shape{4});
+    auto kernel = RmsNormalizationCuda::build(0, *x),
+         kCpu = RmsNormalizationCpu::build(0, *x);
+    ASSERT_TRUE(kernel && kCpu);
+    auto res = runtime::Resources();
+    auto routine = kernel->lower(res).routine,
+         rCpu = kCpu->lower(res).routine;
+    // malloc
+    auto &dev = *device::init(Device::Type::Nvidia, 0, "");
+    auto yGpu = dev.malloc(y->bytesSize()),
+         xGpu = dev.malloc(x->bytesSize()),
+         wGpu = dev.malloc(w->bytesSize());
+    // put input data
+    std::vector<float> y_(y->elementsSize());
+    std::vector<float> x_(x->elementsSize());
+    std::vector<float> w_(w->elementsSize());
+    std::iota(x_.begin(), x_.end(), 0);
+    std::iota(w_.begin(), w_.end(), 1);
+    xGpu->copyFromHost(x_.data(), x->bytesSize());
+    wGpu->copyFromHost(w_.data(), w->bytesSize());
+    // inference
+    {
+        void const *inputs[]{*xGpu, *wGpu};
+        void *outputs[]{*yGpu};
+        routine(res, nullptr, inputs, outputs);
+    }
+    {
+        void const *inputs[]{x_.data(), w_.data()};
+        void *outputs[]{y_.data()};
+        rCpu(res, nullptr, inputs, outputs);
+    }
+    // check
+    std::vector<float> result(y->elementsSize());
+    yGpu->copyToHost(result.data(), y->bytesSize());
+    for (auto i : range0_(y_.size())) {
+        EXPECT_FLOAT_EQ(result[i], y_[i]);
+    }
+}

-// #endif
+#endif
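For context: the operation both kernels are expected to compute is presumably the standard RMSNorm, y = x * w / sqrt(mean(x^2) + epsilon), applied along the last axis (length 4 here) with w as a per-element scale. The sketch below is a minimal host-side reference under that assumption; rmsNormRef and its epsilon parameter are illustrative names, not part of this project's API. Note also that the test now compares element-wise with EXPECT_FLOAT_EQ, which accepts differences of up to 4 ULPs, instead of exact vector equality with EXPECT_EQ, since the CUDA and CPU reductions may round differently.

#include <cmath>
#include <cstddef>
#include <vector>

// Minimal RMSNorm reference (assumed semantics, not this project's API):
// each contiguous row of length n is scaled by 1 / sqrt(mean(x^2) + eps)
// and then by the shared weight vector w of length n.
void rmsNormRef(std::vector<float> &y,
                std::vector<float> const &x,
                std::vector<float> const &w,
                std::size_t n, float epsilon) {
    for (std::size_t row = 0; row < x.size() / n; ++row) {
        auto const *x_ = x.data() + row * n;
        auto *y_ = y.data() + row * n;
        float ss = 0;// sum of squares over the row
        for (std::size_t i = 0; i < n; ++i) { ss += x_[i] * x_[i]; }
        auto scale = 1 / std::sqrt(ss / n + epsilon);
        for (std::size_t i = 0; i < n; ++i) { y_[i] = x_[i] * scale * w[i]; }
    }
}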