Skip to content

Commit ffb63b4

Browse files
committed
feat: refactor kernel dir and add flashinfer for cuda kernel.
1 parent 762ee79 commit ffb63b4

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

61 files changed

+2316
-349
lines changed

.gitmodules

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,9 @@
2828
[submodule "third_party/Mooncake"]
2929
path = third_party/Mooncake
3030
url = https://gitcode.com/xLLM-AI/Mooncake.git
31+
[submodule "third_party/flashinfer"]
32+
path = third_party/flashinfer
33+
url = https://gitcode.com/xLLM-AI/flashinfer.git
34+
[submodule "third_party/cutlass"]
35+
path = third_party/cutlass
36+
url = https://gitcode.com/xLLM-AI/cutlass.git

third_party/CMakeLists.txt

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,24 @@ target_include_directories(mooncake_store PUBLIC
2020
)
2121

2222
target_link_libraries(mooncake_store PUBLIC transfer_engine cachelib_memory_allocator)
23+
24+
25+
if(USE_CUDA)
26+
cc_library(
27+
NAME
28+
cutlass
29+
INCLUDES
30+
cutlass/include
31+
cutlass/tools/util/include
32+
DEPS
33+
torch # TODO: depends on CUDA instead of torch
34+
)
35+
cc_library(
36+
NAME
37+
flashinfer
38+
INCLUDES
39+
flashinfer/include
40+
DEPS
41+
cutlass
42+
)
43+
endif()

third_party/cutlass

Submodule cutlass added at e6e2cc2

third_party/flashinfer

Submodule flashinfer added at bd98dac

xllm/core/framework/batch/batch_input_builder.cpp

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ void BatchInputBuilder::process_sequences_multithreaded(uint32_t start_idx,
212212
state_.q_seq_lens.insert(state_.q_seq_lens.end(),
213213
state.q_seq_lens.begin(),
214214
state.q_seq_lens.end());
215-
#elif defined(USE_MLU)
215+
#elif defined(USE_MLU) || defined(USE_CUDA)
216216
int32_t seq_len_offset = state_.seq_lens.back();
217217
// skip the first element which is 0
218218
for (size_t i = 1; i < state.seq_lens.size(); ++i) {
@@ -284,7 +284,7 @@ void BatchInputBuilder::process_single_sequence(
284284
#if defined(USE_NPU)
285285
state.seq_lens.push_back(seq_len);
286286
state.q_seq_lens.push_back(q_seq_len);
287-
#elif defined(USE_MLU)
287+
#elif defined(USE_MLU) || defined(USE_CUDA)
288288
state.seq_lens.push_back(state.seq_lens.back() + seq_len);
289289
state.q_seq_lens.push_back(state.q_seq_lens.back() + q_seq_len);
290290
#endif
@@ -437,7 +437,12 @@ void BatchInputBuilder::setup_kv_cache_info(
437437
block_size = block.size();
438438
block_ids.push_back(block.id());
439439
u_block_ids.emplace_back(block.id());
440+
state.paged_kv_indices.push_back(block.id());
440441
}
442+
state.paged_kv_indptr.push_back(state.paged_kv_indptr.back() + blocks.size());
443+
int32_t last_page_len =
444+
(seq_len % block_size == 0) ? block_size : seq_len % block_size;
445+
state.paged_kv_last_page_len.push_back(last_page_len);
441446

442447
int32_t kv_cache_block_idx = n_kv_cache_tokens / block_size;
443448
for (auto iter = block_ids.begin() + kv_cache_block_idx;
@@ -506,12 +511,15 @@ void BatchInputBuilder::padding_decode_batch_size(
506511
#if defined(USE_NPU)
507512
state_.seq_lens.push_back(num_decoding_tokens);
508513
state_.q_seq_lens.push_back(num_decoding_tokens);
509-
#elif defined(USE_MLU)
514+
#elif defined(USE_MLU) || defined(USE_CUDA)
510515
state_.seq_lens.push_back(state_.seq_lens.back() + num_decoding_tokens);
511516
state_.q_seq_lens.push_back(state_.q_seq_lens.back() +
512517
num_decoding_tokens);
513518
#endif
514519
state_.block_tables_vec.emplace_back();
520+
state_.paged_kv_indices.push_back(0);
521+
state_.paged_kv_indptr.push_back(state_.paged_kv_indptr.back() + 1);
522+
state_.paged_kv_last_page_len.push_back(1);
515523
}
516524
}
517525
}

xllm/core/framework/batch/batch_input_builder.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,11 @@ class BatchInputBuilder {
104104
// for continuous kvcache
105105
std::vector<int64_t> new_cache_slot_offsets; //[n_tokens]
106106
std::vector<int64_t> kv_cache_start_offsets; //[n_seq]
107+
108+
// for flashinfer
109+
std::vector<int32_t> paged_kv_indptr = {0};
110+
std::vector<int32_t> paged_kv_indices;
111+
std::vector<int32_t> paged_kv_last_page_len;
107112
};
108113

109114
// Helper methods for sequence processing
@@ -128,7 +133,6 @@ class BatchInputBuilder {
128133
uint32_t q_seq_len,
129134
BuilderState* state_ptr = nullptr,
130135
std::unordered_set<int32_t>* write_block_ids_ptr = nullptr);
131-
132136
void setup_continuous_kv_cache_info(Sequence* sequence,
133137
uint32_t n_kv_cache_tokens,
134138
uint32_t seq_len,

xllm/core/framework/model/model_input_params.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,12 @@ struct ModelInputParams {
9393

9494
// Copy graph_buffer to device
9595
params.graph_buffer = safe_to(graph_buffer, device, true);
96+
97+
// params for flashinfer
98+
params.paged_kv_indptr = safe_to(paged_kv_indptr, device);
99+
params.paged_kv_indices = safe_to(paged_kv_indices, device);
100+
params.paged_kv_last_page_len = safe_to(paged_kv_last_page_len, device);
101+
96102
return params;
97103
}
98104

@@ -192,6 +198,21 @@ struct ModelInputParams {
192198
// Graph execution buffer for temporary tensor storage
193199
// Used by ACL Graph Executor to avoid repeated memory allocation
194200
torch::Tensor graph_buffer;
201+
202+
// the indptr of the paged kv-cache
203+
// used in flashinfer
204+
// IntTensor: [n_seq + 1]
205+
torch::Tensor paged_kv_indptr;
206+
207+
// the page indices of the paged kv cache
208+
// used in flashinfer
209+
torch::Tensor paged_kv_indices;
210+
211+
// the number of entries in the last page of each request in
212+
// the paged kv cache
213+
// used in flashinfer
214+
// IntTensor: [n_seq]
215+
torch::Tensor paged_kv_last_page_len;
195216
};
196217

197218
} // namespace xllm

xllm/core/kernels/CMakeLists.txt

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,28 @@
11
include(cc_library)

if(USE_NPU)
  add_subdirectory(npu)
endif()

if(USE_MLU)
  add_subdirectory(mlu)
endif()

if(USE_CUDA)
  add_subdirectory(cuda)
endif()

# Device-agnostic kernel facade. The backend-specific kernel library is
# linked in via generator expressions keyed on the build flags.
cc_library(
  NAME
    kernels
  HDRS
    param.h
    ops_api.h
  SRCS
    ops_api.cpp
  DEPS
    torch
    $<$<BOOL:${USE_NPU}>:npu_kernels>
    $<$<BOOL:${USE_MLU}>:mlu_kernels>
    $<$<BOOL:${USE_CUDA}>:cuda_kernels>
)
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
include(cc_library)
2+
3+
file(GLOB_RECURSE CUDA_HEADER_FILES
4+
"${CMAKE_CURRENT_LIST_DIR}/*.h"
5+
)
6+
7+
file(GLOB_RECURSE CUDA_SOURCE_FILES
8+
"${CMAKE_CURRENT_LIST_DIR}/*.cpp"
9+
)
10+
11+
cc_library(
12+
NAME
13+
cuda_kernels
14+
HDRS
15+
${CUDA_HEADER_FILES}
16+
SRCS
17+
${CUDA_SOURCE_FILES}
18+
DEPS
19+
flashinfer
20+
)

xllm/core/kernels/cuda/active.cpp

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
/* Copyright 2025 The xLLM Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
https://github.com/jd-opensource/xllm/blob/main/LICENSE
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.
14+
==============================================================================*/
15+
16+
#include <cuda_runtime.h>
17+
18+
#include <flashinfer/activation.cuh>
19+
20+
#include "cuda_ops_api.h"
21+
22+
using namespace flashinfer;
23+
24+
namespace xllm::kernel::cuda {

// Device-side activation functions. Each maps one float to one float and is
// used as the Activation template parameter of flashinfer's
// activation::act_and_mul_kernel.

// SiLU: x * sigmoid(x), using the fast __expf intrinsic.
__device__ __forceinline__ float silu(const float& val) {
  return val / (1.0f + __expf(-val));
}

// Exact GeLU: x * Phi(x), Phi being the standard normal CDF.
__device__ __forceinline__ float gelu(const float& val) {
  constexpr float kAlpha = M_SQRT1_2;
  // erff: single-precision erf. The original called ::erf, which is the
  // double-precision C function and forces a slow double-precision path
  // on device.
  return val * 0.5f * (1.0f + erff(val * kAlpha));
}

// Tanh-approximated GeLU (GPT-style approximation).
__device__ __forceinline__ float gelu_tanh(const float& val) {
  const float cdf =
      0.5f * (1.0f + math::tanh((0.7978845608028654f *
                                 (val + 0.044715f * val * val * val))));
  return val * cdf;
}

// Fused activation-and-multiply over the last dimension of `input`.
// The last dimension is split in half (d = last_dim / 2); one half is passed
// through the activation selected by `act_mode` ("silu", "gelu" or
// "gelu_tanh") and multiplied elementwise with the other half, writing d
// elements per token into `out`. (Which half is activated follows
// flashinfer's act_and_mul_kernel convention — presumably the first half;
// confirm against the flashinfer header.) `enable_pdl` toggles programmatic
// dependent launch via the stream-serialization launch attribute.
void act_and_mul(TensorView out,
                 TensorView input,
                 const std::string& act_mode,
                 bool enable_pdl) {
  int d = input->shape[input->ndim - 1] / 2;
  int64_t num_tokens = input.numel() / input->shape[input->ndim - 1];

  // Check the cudaSetDevice result; a sticky earlier error would otherwise
  // make every later CUDA call fail mysteriously.
  cudaError_t set_err = cudaSetDevice(out->device.device_id);
  TVM_FFI_ICHECK(set_err == cudaSuccess)
      << "Failed to set device: " << cudaGetErrorString(set_err);
  const cudaStream_t stream = get_stream(out->device);
  DISPATCH_DLPACK_DTYPE_TO_CTYPE_FP16(input->dtype, c_type, [&] {
    uint32_t vec_size = 16 / sizeof(c_type);  // 16-byte vectorized accesses
    cudaLaunchConfig_t config;
    config.gridDim = num_tokens;
    // Clamp to [1, 1024]: d / vec_size can be 0 for tiny d, and a block
    // dimension of 0 is an invalid launch configuration.
    config.blockDim = std::min(std::max(d / vec_size, 1U), 1024U);
    config.dynamicSmemBytes = 0;
    config.stream = stream;
    cudaLaunchAttribute attrs[1];
    attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
    attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
    config.numAttrs = 1;
    config.attrs = attrs;

    // The activation is a compile-time template parameter of the kernel,
    // while act_mode is a runtime string — the original passed act_mode
    // directly as a template argument, which cannot compile. Dispatch the
    // string to a concrete instantiation here.
    auto kernel = activation::act_and_mul_kernel<c_type, silu>;
    if (act_mode == "silu") {
      kernel = activation::act_and_mul_kernel<c_type, silu>;
    } else if (act_mode == "gelu") {
      kernel = activation::act_and_mul_kernel<c_type, gelu>;
    } else if (act_mode == "gelu_tanh") {
      kernel = activation::act_and_mul_kernel<c_type, gelu_tanh>;
    } else {
      TVM_FFI_ICHECK(false) << "Unsupported act_mode: " << act_mode;
    }

    cudaLaunchKernelEx(&config,
                       kernel,
                       static_cast<c_type*>(out->data),
                       static_cast<c_type*>(input->data),
                       d);

    // cudaLaunchKernelEx's config errors surface via cudaGetLastError.
    cudaError_t err = cudaGetLastError();
    TVM_FFI_ICHECK(err == cudaSuccess)
        << "Failed to launch kernel: " << cudaGetErrorString(err);

    return true;
  });
}

}  // namespace xllm::kernel::cuda

0 commit comments

Comments
 (0)