
Commit c462d7d

fix(kernel): pull in the latest cccl so that nvrtc can be used together with cub
Signed-off-by: YdrMaster <ydrml@hotmail.com>
1 parent 5f518a3 commit c462d7d

File tree

5 files changed (+87, -64 lines)


.gitmodules

Lines changed: 3 additions & 0 deletions
@@ -19,3 +19,6 @@
 [submodule "src/09python_ffi/pybind11"]
 	path = src/09python_ffi/pybind11
 	url = git@github.com:pybind/pybind11.git
+[submodule "3rd-party/cccl"]
+	path = 3rd-party/cccl
+	url = git@github.com:NVIDIA/cccl.git

3rd-party/cccl

Submodule cccl added at b7d4228

src/04kernel/src/generator/nvrtc_repo.cc

Lines changed: 16 additions & 0 deletions
@@ -2,6 +2,7 @@

 #include "nvrtc_repo.h"
 #include "hardware/device_manager.h"
+#include <filesystem>
 #include <nvrtc.h>

 #define NVRTC_ASSERT(CALL) \
@@ -38,9 +39,24 @@ namespace refactor::kernel::nvrtc {
         NVRTC_ASSERT(nvrtcCreateProgram(&prog, code.data(), name.data(), 0, nullptr, nullptr));

         std::vector<std::string> opts{"--std=c++17", "--gpu-architecture=compute_80"};
+        {
+            auto proj = std::filesystem::path(__FILE__)
+                            .parent_path()
+                            .parent_path()
+                            .parent_path()
+                            .parent_path()
+                            .parent_path();
+            auto cccl = proj / "3rd-party/cccl";
+            auto cudacxx = cccl / "libcudacxx/include";
+            auto cub = cccl / "cub";
+            ASSERT(std::filesystem::is_directory(cub), "cub not exist");
+            opts.emplace_back(fmt::format("-I{}", cudacxx.c_str()));
+            opts.emplace_back(fmt::format("-I{}", cub.c_str()));
+        }
 #ifdef CUDA_INCLUDE_PATH
         opts.emplace_back(fmt::format("-I{}", CUDA_INCLUDE_PATH));
 #endif
+
         std::vector<const char *> optsPtr(opts.size());
         std::transform(opts.begin(), opts.end(), optsPtr.begin(),
                        [](auto &s) { return s.c_str(); });
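The block added here walks up from `__FILE__` to the project root, locates the cccl checkout, and passes its `libcudacxx/include` and `cub` directories to NVRTC as `-I` options, so runtime-compiled kernels can `#include <cub/...>`. For context, below is a minimal standalone sketch, not the repository's `nvrtc_repo.cc`, of how such options reach an NVRTC compile call; the include paths and the trivial kernel source are placeholders.

```cpp
// Minimal sketch: feeding -I options (such as the cccl paths assembled above)
// into NVRTC. Paths and the kernel source are placeholders for illustration.
#include <nvrtc.h>
#include <cstdio>
#include <string>
#include <vector>

int main() {
    const char *src = R"(extern "C" __global__ void noop() {})";

    nvrtcProgram prog;
    nvrtcCreateProgram(&prog, src, "noop.cu", 0, nullptr, nullptr);

    // The commit builds these strings with std::filesystem + fmt::format;
    // here they are hard-coded placeholders.
    std::vector<std::string> opts{
        "--std=c++17",
        "--gpu-architecture=compute_80",
        "-I/path/to/3rd-party/cccl/libcudacxx/include",
        "-I/path/to/3rd-party/cccl/cub",
    };
    std::vector<const char *> optsPtr;
    for (auto &s : opts) optsPtr.push_back(s.c_str());

    auto status = nvrtcCompileProgram(prog, (int) optsPtr.size(), optsPtr.data());

    // Always fetch the log: a missing include path typically shows up here
    // as a "cannot open source file" error rather than failing earlier.
    size_t logSize;
    nvrtcGetProgramLogSize(prog, &logSize);
    std::string log(logSize, '\0');
    nvrtcGetProgramLog(prog, log.data());
    if (status != NVRTC_SUCCESS) std::fprintf(stderr, "%s\n", log.c_str());

    // On success, the PTX can be retrieved and loaded with the CUDA driver API.
    size_t ptxSize;
    nvrtcGetPTXSize(prog, &ptxSize);
    std::string ptx(ptxSize, '\0');
    nvrtcGetPTX(prog, ptx.data());

    nvrtcDestroyProgram(&prog);
    return status == NVRTC_SUCCESS ? 0 : 1;
}
```

Since NVRTC does not validate include directories up front, errors only appear when an `#include` cannot be resolved, which is presumably why the commit asserts that the `cub` directory exists before adding it.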

src/04kernel/src/kernels/rms_normalization/cuda_kernel.cc

Lines changed: 15 additions & 14 deletions
@@ -48,31 +48,27 @@ namespace refactor::kernel {

         // 0: data type
         // 1: block size
-        // 2: epsilon cast
+        // 2: T -> float
+        // 3: T <- float
         constexpr static const char *TEMPLATE = R"~(
-#include <cub/cub.cuh>
-
-static __device__ __forceinline__ {0:} squareSum({0:} a, {0:} b) {{
-    return a * a + b * b;
-}}
+#include <cub/block/block_reduce.cuh>

 extern "C" __global__ void kernel(
-    {0:} *__restrict__ const y,
-    {0:} const *__restrict__ const x,
-    {0:} const *__restrict__ const w,
-    float epsilon_) {{
+    {0:} *__restrict__ y,
+    {0:} const *__restrict__ x,
+    {0:} const *__restrict__ w,
+    float epsilon) {{

-    auto epsilon = {2:}(epsilon_);
     x += blockIdx.x * blockDim.x + threadIdx.x;
-    y += blockIdx.x * blockDim.x + threadIdx.x;;
+    y += blockIdx.x * blockDim.x + threadIdx.x;
     w += threadIdx.x;

     using BlockReduce = cub::BlockReduce<{0:}, {1:}>;
     __shared__ typename BlockReduce::TempStorage tempStorage;
     __shared__ {0:} rms;
-    auto acc = BlockReduce(tempStorage).Reduce(*x, squareSum);
+    auto acc = BlockReduce(tempStorage).Reduce(*x * *x, cub::Sum());
     if (threadIdx.x == 0) {{
-        rms = rsqrt(acc / blockDim.x + epsilon);
+        rms = {3:}(rsqrt({2:}(acc) / blockDim.x + epsilon));
     }}
     __syncthreads();
@@ -96,6 +92,11 @@ extern "C" __global__ void kernel(
                     : dataType == DataType::F64 ? "static_cast<float>"
                     : dataType == DataType::FP16 ? "__half2float"
                     : dataType == DataType::BF16 ? "__bfloat162float"
+                    : UNREACHABLEX(const char*, "unreachable"),
+                dataType == DataType::F32 ? ""
+                    : dataType == DataType::F64 ? ""
+                    : dataType == DataType::FP16 ? "__float2half"
+                    : dataType == DataType::BF16 ? "__float2bfloat16"
                     : UNREACHABLEX(const char*, "unreachable")
                 // clang-format on
             );
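This change drops the hand-written `squareSum` reduction operator in favour of `cub::BlockReduce(...).Reduce(*x * *x, cub::Sum())`, includes only `<cub/block/block_reduce.cuh>`, and converts the accumulator to `float` (placeholder `{2:}`) for the `rsqrt` before casting back (placeholder `{3:}`) into the shared `rms` value. As a rough illustration, the sketch below is approximately what the formatted template could instantiate to for FP16, assuming a hypothetical block size of 1024; the final write after `__syncthreads()` is not part of this hunk, so the scaling line here is only the usual RMSNorm update, added as an assumption for completeness.

```cuda
// Illustration only: roughly the FP16 instantiation of the template above,
// with an assumed block size of 1024. Not generated output.
#include <cub/block/block_reduce.cuh>
#include <cuda_fp16.h>

extern "C" __global__ void kernel(
    half *__restrict__ y,
    half const *__restrict__ x,
    half const *__restrict__ w,
    float epsilon) {

    x += blockIdx.x * blockDim.x + threadIdx.x;
    y += blockIdx.x * blockDim.x + threadIdx.x;
    w += threadIdx.x;

    using BlockReduce = cub::BlockReduce<half, 1024>;
    __shared__ typename BlockReduce::TempStorage tempStorage;
    __shared__ half rms;
    // Block-wide sum of squares, then 1/sqrt(mean + eps) computed on thread 0
    // in float and cast back to the storage type.
    auto acc = BlockReduce(tempStorage).Reduce(*x * *x, cub::Sum());
    if (threadIdx.x == 0) {
        rms = __float2half(rsqrtf(__half2float(acc) / blockDim.x + epsilon));
    }
    __syncthreads();

    // Assumed final step (not shown in the hunk): normalize and apply weight.
    *y = *x * rms * *w;
}
```

Routing the `rsqrt` through `float` matters mainly for FP16/BF16, which use the CUDA conversion intrinsics selected by the new ternary chains; F64 converts with `static_cast<float>`, and the new `{3:}` chain casts back (empty for F32/F64).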
Lines changed: 52 additions & 50 deletions
@@ -1,54 +1,56 @@
-// #ifdef USE_CUDA
+#ifdef USE_CUDA

-// #include "../../../src/kernels/rms_normalization/cpu_kernel.hh"
-// #include "../../../src/kernels/rms_normalization/cuda_kernel.hh"
-// #include "hardware/device_manager.h"
-// #include <gtest/gtest.h>
-// #include <numeric>
+#include "../../../src/kernels/rms_normalization/cpu_kernel.hh"
+#include "../../../src/kernels/rms_normalization/cuda_kernel.hh"
+#include "hardware/device_manager.h"
+#include <gtest/gtest.h>
+#include <numeric>

-// using namespace refactor;
-// using namespace kernel;
-// using namespace hardware;
+using namespace refactor;
+using namespace kernel;
+using namespace hardware;

-// TEST(kernel, RmsNormalizationCuda) {
-//     // build routine
-//     auto y = Tensor::share(DataType::F32, Shape{2, 3, 4});
-//     auto x = Tensor::share(DataType::F32, Shape{2, 3, 4});
-//     auto w = Tensor::share(DataType::F32, Shape{4});
-//     auto kernel = RmsNormalizationCuda::build(0, *x),
-//          kCpu = RmsNormalizationCpu::build(0, *x);
-//     ASSERT_TRUE(kernel && kCpu);
-//     auto res = runtime::Resources();
-//     auto routine = kernel->lower(res).routine,
-//          rCpu = kCpu->lower(res).routine;
-//     // malloc
-//     auto &dev = *device::init(Device::Type::Nvidia, 0, "");
-//     auto yGpu = dev.malloc(y->bytesSize()),
-//          xGpu = dev.malloc(x->bytesSize()),
-//          wGpu = dev.malloc(w->bytesSize());
-//     // put input data
-//     std::vector<float> y_(y->elementsSize());
-//     std::vector<float> x_(x->elementsSize());
-//     std::vector<float> w_(w->elementsSize());
-//     std::iota(x_.begin(), x_.end(), 0);
-//     std::iota(w_.begin(), w_.end(), 1);
-//     xGpu->copyFromHost(x_.data(), x->bytesSize());
-//     wGpu->copyFromHost(w_.data(), w->bytesSize());
-//     // inference
-//     {
-//         void const *inputs[]{*xGpu, *wGpu};
-//         void *outputs[]{*yGpu};
-//         routine(res, nullptr, inputs, outputs);
-//     }
-//     {
-//         void const *inputs[]{x_.data(), w_.data()};
-//         void *outputs[]{y_.data()};
-//         rCpu(res, nullptr, inputs, outputs);
-//     }
-//     // check
-//     std::vector<float> result(y->elementsSize());
-//     yGpu->copyToHost(result.data(), y->bytesSize());
-//     EXPECT_EQ(result, y_);
-// }
+TEST(kernel, RmsNormalizationCuda) {
+    // build routine
+    auto y = Tensor::share(DataType::F32, Shape{2, 3, 4});
+    auto x = Tensor::share(DataType::F32, Shape{2, 3, 4});
+    auto w = Tensor::share(DataType::F32, Shape{4});
+    auto kernel = RmsNormalizationCuda::build(0, *x),
+         kCpu = RmsNormalizationCpu::build(0, *x);
+    ASSERT_TRUE(kernel && kCpu);
+    auto res = runtime::Resources();
+    auto routine = kernel->lower(res).routine,
+         rCpu = kCpu->lower(res).routine;
+    // malloc
+    auto &dev = *device::init(Device::Type::Nvidia, 0, "");
+    auto yGpu = dev.malloc(y->bytesSize()),
+         xGpu = dev.malloc(x->bytesSize()),
+         wGpu = dev.malloc(w->bytesSize());
+    // put input data
+    std::vector<float> y_(y->elementsSize());
+    std::vector<float> x_(x->elementsSize());
+    std::vector<float> w_(w->elementsSize());
+    std::iota(x_.begin(), x_.end(), 0);
+    std::iota(w_.begin(), w_.end(), 1);
+    xGpu->copyFromHost(x_.data(), x->bytesSize());
+    wGpu->copyFromHost(w_.data(), w->bytesSize());
+    // inference
+    {
+        void const *inputs[]{*xGpu, *wGpu};
+        void *outputs[]{*yGpu};
+        routine(res, nullptr, inputs, outputs);
+    }
+    {
+        void const *inputs[]{x_.data(), w_.data()};
+        void *outputs[]{y_.data()};
+        rCpu(res, nullptr, inputs, outputs);
+    }
+    // check
+    std::vector<float> result(y->elementsSize());
+    yGpu->copyToHost(result.data(), y->bytesSize());
+    for (auto i : range0_(y_.size())) {
+        EXPECT_FLOAT_EQ(result[i], y_[i]);
+    }
+}

-// #endif
+#endif
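This test is now enabled: it runs the NVRTC-compiled CUDA kernel and the CPU kernel on the same inputs and compares the outputs element-wise with `EXPECT_FLOAT_EQ` instead of requiring bit-exact vector equality. For orientation, RMS normalization over the last axis computes `y[i] = x[i] * w[i] / sqrt(mean(x^2) + epsilon)`; the sketch below is a small host-side reference written for this note, an assumption about what `RmsNormalizationCpu` computes rather than its actual code.

```cpp
// Hypothetical host-side reference for RMS normalization over the last axis.
// Written for this note; not the repository's CPU kernel.
#include <cmath>
#include <cstddef>
#include <vector>

// Normalize each contiguous row of length `dim`:
//   y = x * w / sqrt(mean(x^2) + epsilon)
void rmsNormRef(std::vector<float> &y,
                std::vector<float> const &x,
                std::vector<float> const &w,
                std::size_t dim,
                float epsilon) {
    for (std::size_t row = 0; row < x.size() / dim; ++row) {
        float ss = 0;
        for (std::size_t i = 0; i < dim; ++i) {
            float v = x[row * dim + i];
            ss += v * v;  // sum of squares within the row
        }
        float rms = 1.f / std::sqrt(ss / dim + epsilon);
        for (std::size_t i = 0; i < dim; ++i) {
            y[row * dim + i] = x[row * dim + i] * rms * w[i];
        }
    }
}
```

For the shapes in the test, `dim` would be 4 (the trailing axis of `{2, 3, 4}`), and the first argument of `build(0, *x)` appears to be an epsilon of 0.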
