/* Copyright 2025 The xLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://github.com/jd-opensource/xllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
15+
#include <cuda_runtime.h>

#include <algorithm>
#include <string>

#include <flashinfer/activation.cuh>

#include "cuda_ops_api.h"

using namespace flashinfer;
23+
namespace xllm::kernel::cuda {

// SiLU (sigmoid-weighted linear unit): x * sigmoid(x), computed as
// x / (1 + e^-x) using the fast device exponential intrinsic.
__device__ __forceinline__ float silu(const float& val) {
  return val / (1.0f + __expf(-val));
}

// Exact GELU: x * Phi(x), where Phi is the standard normal CDF,
// Phi(x) = 0.5 * (1 + erf(x / sqrt(2))).
__device__ __forceinline__ float gelu(const float& val) {
  constexpr float kAlpha = M_SQRT1_2;  // 1 / sqrt(2)
  return val * 0.5f * (1.0f + ::erf(val * kAlpha));
}

// Tanh-approximated GELU:
//   x * 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
// where 0.7978845608028654f == sqrt(2/pi).
__device__ __forceinline__ float gelu_tanh(const float& val) {
  const float cdf =
      0.5f * (1.0f + math::tanh((0.7978845608028654f *
                                 (val + 0.044715f * val * val * val))));
  return val * cdf;
}

// Fused gated-activation entry point ("act-and-mul").
//
// `input` has last dimension 2*d: the activation selected by `act_mode`
// ("silu", "gelu", or "gelu_tanh") is applied to the first half and
// multiplied element-wise with the second half; the [..., d] result is
// written to `out`. Launch layout: one CUDA block per token row,
// blockDim sized so each thread handles one 16-byte vector of the row
// (capped at 1024 threads). `enable_pdl` toggles programmatic dependent
// launch (programmatic stream serialization) for the kernel launch.
void act_and_mul(TensorView out,
                 TensorView input,
                 const std::string& act_mode,
                 bool enable_pdl) {
  const int d = input->shape[input->ndim - 1] / 2;
  const int64_t num_tokens = input.numel() / input->shape[input->ndim - 1];

  cudaSetDevice(out->device.device_id);
  const cudaStream_t stream = get_stream(out->device);
  DISPATCH_DLPACK_DTYPE_TO_CTYPE_FP16(input->dtype, c_type, [&] {
    // Each thread processes one 16-byte vector of the row.
    uint32_t vec_size = 16 / sizeof(c_type);
    cudaLaunchConfig_t config;
    config.gridDim = num_tokens;  // one block per token
    config.blockDim = std::min(d / vec_size, 1024U);
    config.dynamicSmemBytes = 0;
    config.stream = stream;
    cudaLaunchAttribute attrs[1];
    attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
    attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
    config.numAttrs = 1;
    config.attrs = attrs;

    auto launch = [&](auto kernel) {
      cudaLaunchKernelEx(&config,
                         kernel,
                         static_cast<c_type*>(out->data),
                         static_cast<c_type*>(input->data),
                         d);
    };

    // The flashinfer kernel takes the activation as a compile-time
    // template parameter (a device function pointer), so the runtime
    // `act_mode` string must be dispatched here — a std::string cannot
    // be used as a template argument.
    if (act_mode == "silu") {
      launch(activation::act_and_mul_kernel<c_type, silu>);
    } else if (act_mode == "gelu") {
      launch(activation::act_and_mul_kernel<c_type, gelu>);
    } else if (act_mode == "gelu_tanh") {
      launch(activation::act_and_mul_kernel<c_type, gelu_tanh>);
    } else {
      TVM_FFI_ICHECK(false) << "Unsupported act_mode: " << act_mode;
    }

    cudaError_t err = cudaGetLastError();
    TVM_FFI_ICHECK(err == cudaSuccess)
        << "Failed to launch kernel: " << cudaGetErrorString(err);

    return true;
  });
}

}  // namespace xllm::kernel::cuda