/* Copyright 2025 The xLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://github.com/jd-opensource/xllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
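
// Implements the rerank endpoint: the query and every candidate document are
// embedded by the LLM master, each document is scored by cosine similarity to
// the query embedding, and the top_n best matches are returned to the client.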
#include "rerank_service_impl.h"

#include <absl/time/clock.h>
#include <absl/time/time.h>
#include <glog/logging.h>
#include <torch/torch.h>

#include <algorithm>
#include <cstdint>
#include <memory>
#include <string>
#include <vector>

#include "common/instance_name.h"
#include "framework/request/request_params.h"
#include "runtime/llm_master.h"
#include "util/blocking_counter.h"
#include "util/utils.h"
#include "util/uuid.h"

namespace xllm {
namespace {

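// One scored candidate: keeps the document's original index and text together
// with its similarity score so results can be sorted without losing position.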
struct RerankRequestOutput {
  int32_t index = 0;
  std::string document;
  float score = 0.0f;

  RerankRequestOutput(int32_t index, std::string document, float score)
      : index(index), document(std::move(document)), score(score) {}
};

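// Builds the rerank response from the embedding outputs. By convention the
// query was appended as the last element of `documents`, so its embedding is
// req_outputs[documents.size() - 1]; all earlier entries are documents.
// Relevance is plain cosine similarity:
//   score(q, d) = (q . d) / (||q|| * ||d||)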
bool send_result_to_client_brpc(std::shared_ptr<RerankCall> call,
                                const std::string& request_id,
                                int64_t created_time,
                                const std::string& model,
                                const std::vector<std::string>& documents,
                                int32_t top_n,
                                const std::vector<RequestOutput>& req_outputs) {
  auto& response = call->response();
  response.set_id(request_id);
  response.set_model(model);

  // The query embedding is the last entry. torch::from_blob does not copy or
  // own the data, so `query_embed` must stay alive while the tensor is used.
  size_t doc_size = documents.size() - 1;
  auto query_embed = req_outputs[doc_size].outputs[0].embeddings.value();
  auto query_tensor =
      torch::from_blob(query_embed.data(),
                       {static_cast<int64_t>(query_embed.size())},
                       torch::kFloat32);

  // Score every document against the query, skipping entries whose embedding
  // is missing (e.g. the request for that document failed).
  std::vector<RerankRequestOutput> rerank_outputs;
  rerank_outputs.reserve(doc_size);
  for (size_t i = 0; i < doc_size; ++i) {
    if (req_outputs[i].outputs[0].embeddings.has_value()) {
      auto doc_embed = req_outputs[i].outputs[0].embeddings.value();
      auto doc_tensor =
          torch::from_blob(doc_embed.data(),
                           {static_cast<int64_t>(doc_embed.size())},
                           torch::kFloat32);
      auto score =
          torch::cosine_similarity(query_tensor, doc_tensor, /*dim=*/0)
              .item<float>();
      rerank_outputs.emplace_back(i, documents[i], score);
    }
  }

  // Sort by descending relevance score.
  std::sort(rerank_outputs.begin(),
            rerank_outputs.end(),
            [](const RerankRequestOutput& a, const RerankRequestOutput& b) {
              return a.score > b.score;
            });

  // Emit at most top_n results; clamp to the number of documents that were
  // actually scored, which may be fewer than doc_size if embeddings were
  // missing.
  int32_t valid_top_n =
      std::min(top_n, static_cast<int32_t>(rerank_outputs.size()));
  response.mutable_results()->Reserve(valid_top_n);
  for (int32_t i = 0; i < valid_top_n; ++i) {
    auto* result = response.add_results();
    result->set_index(rerank_outputs[i].index);
    result->mutable_document()->set_text(rerank_outputs[i].document);
    result->set_relevance_score(rerank_outputs[i].score);
  }

  // Aggregate token usage across all embedding requests.
  int32_t num_prompt_tokens = 0;
  int32_t num_generated_tokens = 0;
  int32_t num_total_tokens = 0;
  for (const auto& req_output : req_outputs) {
    if (req_output.usage.has_value()) {
      const auto& usage = req_output.usage.value();
      num_prompt_tokens += usage.num_prompt_tokens;
      num_generated_tokens += usage.num_generated_tokens;
      num_total_tokens += usage.num_total_tokens;
    }
  }
  if (num_total_tokens > 0) {
    auto* proto_usage = response.mutable_usage();
    proto_usage->set_prompt_tokens(num_prompt_tokens);
    proto_usage->set_completion_tokens(num_generated_tokens);
    proto_usage->set_total_tokens(num_total_tokens);
  }

  return call->write_and_finish(response);
}
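
// Worked example (illustrative numbers only): with query embedding q = [1, 0]
// and document embeddings d0 = [1, 0], d1 = [0, 1], d2 = [0.6, 0.8]:
//   score(q, d0) = 1.0, score(q, d1) = 0.0, score(q, d2) = 0.6
// so with top_n = 2 the response contains d0 (index 0) then d2 (index 2).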

}  // namespace

RerankServiceImpl::RerankServiceImpl(LLMMaster* master,
                                     const std::vector<std::string>& models)
    : APIServiceImpl(models), master_(master) {
  CHECK(master_ != nullptr);
}

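// Illustrative request/response shapes implied by the proto accessors used
// below (the exact JSON mapping depends on the HTTP gateway, so treat this as
// a sketch rather than the canonical wire format):
//
//   request:  {"model": "...", "query": "...",
//              "documents": ["doc a", "doc b"], "top_n": 1}
//   response: {"id": "...", "model": "...",
//              "results": [{"index": 0, "document": {"text": "doc a"},
//                           "relevance_score": 0.93}],
//              "usage": {"prompt_tokens": ..., "total_tokens": ...}}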
// Handles an asynchronous rerank request over brpc.
void RerankServiceImpl::process_async_impl(std::shared_ptr<RerankCall> call) {
  const auto& rpc_request = call->request();
  // Check whether the requested model is served by this instance.
  const auto& model = rpc_request.model();
  if (!models_.contains(model)) {
    call->finish_with_error(StatusCode::UNKNOWN, "Model not supported");
    return;
  }

  // Collect the documents and append the query as the final entry so the
  // whole batch can be embedded in a single pass.
  std::vector<std::string> documents;
  if (rpc_request.documents_size() > 0) {
    documents.assign(rpc_request.documents().begin(),
                     rpc_request.documents().end());
  }
  documents.emplace_back(rpc_request.query());

  // Create one RequestParams per input; all share the same request metadata.
  RequestParams request_params(
      rpc_request, call->get_x_request_id(), call->get_x_request_time());
  std::vector<RequestParams> sps(documents.size(), request_params);
  auto request_id = request_params.request_id;
  auto created_time = absl::ToUnixSeconds(absl::Now());

  // Schedule the batch and collect one output per input.
  std::vector<RequestOutput> req_outputs;
  req_outputs.resize(documents.size());
  BlockingCounter counter(documents.size());

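  // The callback runs on worker threads and writes into `req_outputs` by
  // reference. This is safe only because counter.wait() below blocks this
  // function until all documents.size() callbacks have fired, keeping both
  // captured objects alive for the callback's entire lifetime.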
  auto batch_callback = [&req_outputs, &counter](
                            size_t index, RequestOutput output) -> bool {
    req_outputs[index] = std::move(output);
    counter.decrement_count();
    return true;
  };

  master_->handle_batch_request(documents, sps, batch_callback);

  // Wait for all embedding tasks to complete.
  counter.wait();

  // Default to ranking every document when top_n is not specified.
  int32_t top_n = static_cast<int32_t>(documents.size()) - 1;
  if (rpc_request.has_top_n()) {
    top_n = rpc_request.top_n();
  }

  send_result_to_client_brpc(
      call, request_id, created_time, model, documents, top_n, req_outputs);
}

}  // namespace xllm